euler314 committed
Commit 942484e · verified · 1 Parent(s): 82c1030

Update app.py

Files changed (1)
  1. app.py +1451 -1
app.py CHANGED
@@ -1701,8 +1701,1458 @@ class DownloadManager:
1701
  await asyncio.sleep(2)
1702
  if not st.session_state.get('keep_progress', False):
1703
  progress_text.empty()
1704
- progress_bar.empty()
1705
1706
  # Utility Functions for New Features
1707
  def extract_keywords(text, n=5):
1708
  doc = nlp_model(text)
1701
  await asyncio.sleep(2)
1702
  if not st.session_state.get('keep_progress', False):
1703
  progress_text.empty()
1704
+ progress_bar.empty()class DownloadManager:
1705
+ def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
1706
+ self.use_proxy = use_proxy
1707
+ self.proxy = proxy
1708
+ self.query = query
1709
+ self.num_results = num_results
1710
+ self.playwright = None
1711
+ self.browser = None
1712
+ self.context = None
1713
+ self.page = None
1714
+
1715
+ async def __aenter__(self):
1716
+ self.playwright = await async_playwright().start()
1717
+ opts = {
1718
+ "headless": True,
1719
+ "args": [
1720
+ '--no-sandbox',
1721
+ '--disable-setuid-sandbox',
1722
+ '--disable-dev-shm-usage',
1723
+ '--disable-gpu',
1724
+ '--no-zygote',
1725
+ '--single-process'
1726
+ ]
1727
+ }
1728
+ if self.use_proxy and self.proxy:
1729
+ opts["proxy"] = {"server": self.proxy}
1730
+ self.browser = await self.playwright.chromium.launch(**opts)
1731
+ self.context = await self.browser.new_context(user_agent=get_random_user_agent())
1732
+ self.page = await self.context.new_page()
1733
+ await self.page.set_extra_http_headers({
1734
+ 'Accept-Language': 'en-US,en;q=0.9',
1735
+ 'Accept-Encoding': 'gzip, deflate, br',
1736
+ 'Referer': 'https://www.bing.com/'
1737
+ })
1738
+ return self
1739
+
1740
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
1741
+ if self.browser:
1742
+ await self.browser.close()
1743
+ if self.playwright:
1744
+ await self.playwright.stop()
1745
+
1746
+ async def search_bing(self):
1747
+ urls = []
1748
+ try:
1749
+ search_url = f"https://www.bing.com/search?q={self.query}"
1750
+ await self.page.goto(search_url, timeout=30000)
1751
+ await self.page.wait_for_load_state('networkidle')
1752
+ links = await self.page.query_selector_all("li.b_algo h2 a")
1753
+ for link in links[:self.num_results]:
1754
+ href = await link.get_attribute('href')
1755
+ if href:
1756
+ urls.append(href)
1757
+ return urls
1758
+ except Exception as e:
1759
+ logger.error(f"Error searching Bing: {e}")
1760
+ return []
1761
+
1762
+ async def get_file_size(self, url):
1763
+ try:
1764
+ async with self.context.new_page() as page:
1765
+ response = await page.request.head(url, timeout=15000)
1766
+ length = response.headers.get('Content-Length', None)
1767
+ if length:
1768
+ return sizeof_fmt(int(length))
1769
+ else:
1770
+ return "Unknown Size"
1771
+ except Exception:
1772
+ return "Unknown Size"
1773
+
1774
+ async def get_pdf_metadata(self, url):
1775
+ try:
1776
+ async with self.context.new_page() as page:
1777
+ resp = await page.request.get(url, timeout=15000)
1778
+ if resp.ok:
1779
+ content = await resp.body()
1780
+ pdf = BytesIO(content)
1781
+ reader = PdfReader(pdf)
1782
+ return {
1783
+ 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
1784
+ 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
1785
+ 'Pages': len(reader.pages),
1786
+ }
1787
+ else:
1788
+ return {}
1789
+ except Exception:
1790
+ return {}
1791
+
1792
+ async def extract_real_download_url(self, url):
1793
+ try:
1794
+ async with self.context.new_page() as page:
1795
+ response = await page.goto(url, wait_until='networkidle', timeout=30000)
1796
+ if response and response.headers.get('location'):
1797
+ return response.headers['location']
1798
+ return page.url
1799
+ except Exception as e:
1800
+ logger.error(f"Error extracting real download URL: {e}")
1801
+ return url
1802
+
1803
+ async def extract_downloadable_files(self, url, custom_ext_list):
1804
+ found_files = []
1805
+ try:
1806
+ response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
1807
+ if not response:
1808
+ return []
1809
+
1810
+ final_url = self.page.url
1811
+ if '.php' in final_url or 'download' in final_url:
1812
+ real_url = await self.extract_real_download_url(final_url)
1813
+ if real_url != final_url:
1814
+ found_files.append({
1815
+ 'url': real_url,
1816
+ 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
1817
+ 'size': await self.get_file_size(real_url),
1818
+ 'metadata': {}
1819
+ })
1820
+ return found_files
1821
+
1822
+ await self.page.wait_for_load_state('networkidle', timeout=30000)
1823
+ content = await self.page.content()
1824
+ soup = BeautifulSoup(content, 'html.parser')
1825
+
1826
+ default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
1827
+ '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx',
1828
+ '.pptx', '.odt', '.txt']
1829
+ all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
1830
+
1831
+ parsed_base = urlparse(final_url)
1832
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
1833
+ path_base = os.path.dirname(parsed_base.path)
1834
+
1835
+ # Process all anchor tags
1836
+ for a in soup.find_all('a', href=True):
1837
+ href = a['href'].strip()
1838
+
1839
+ if '.php' in href.lower() or 'download' in href.lower():
1840
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1841
+ real_url = await self.extract_real_download_url(full_url)
1842
+ if real_url and real_url != full_url:
1843
+ found_files.append({
1844
+ 'url': real_url,
1845
+ 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
1846
+ 'size': await self.get_file_size(real_url),
1847
+ 'metadata': {}
1848
+ })
1849
+ continue
1850
+
1851
+ if any(href.lower().endswith(ext) for ext in all_exts):
1852
+ file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1853
+ size_str = await self.get_file_size(file_url)
1854
+ meta = {}
1855
+ if file_url.lower().endswith('.pdf'):
1856
+ meta = await self.get_pdf_metadata(file_url)
1857
+ found_files.append({
1858
+ 'url': file_url,
1859
+ 'filename': os.path.basename(file_url.split('?')[0]),
1860
+ 'size': size_str,
1861
+ 'metadata': meta
1862
+ })
1863
+
1864
+ # Handle Google Drive links
1865
+ elif ("drive.google.com" in href) or ("docs.google.com" in href):
1866
+ file_id = None
1867
+ for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
1868
+ match = re.search(pattern, href)
1869
+ if match:
1870
+ file_id = match.group(1)
1871
+ break
1872
+ if file_id:
1873
+ # Get file info to determine type and view-only status
1874
+ file_type, is_view_only = await self.get_google_drive_file_info(file_id)
1875
+
1876
+ # Create a more informative filename based on info
1877
+ filename = f"gdrive_{file_id}"
1878
+ if file_type:
1879
+ filename = f"{filename}.{file_type}"
1880
+
1881
+ size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")
1882
+
1883
+ found_files.append({
1884
+ 'url': href, # Use original URL
1885
+ 'filename': filename,
1886
+ 'size': size_str,
1887
+ 'metadata': {
1888
+ 'view_only': is_view_only,
1889
+ 'file_type': file_type,
1890
+ 'file_id': file_id
1891
+ }
1892
+ })
1893
+
1894
+ # Also check for files in other elements (iframe, embed, object, etc.)
1895
+ other_elements = soup.find_all(['iframe', 'embed', 'object', 'source'])
1896
+ for elem in other_elements:
1897
+ src = elem.get('src') or elem.get('data')
1898
+ if src and any(src.lower().endswith(ext) for ext in all_exts):
1899
+ file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
1900
+ size_str = await self.get_file_size(file_url)
1901
+ meta = {}
1902
+ if file_url.lower().endswith('.pdf'):
1903
+ meta = await self.get_pdf_metadata(file_url)
1904
+ found_files.append({
1905
+ 'url': file_url,
1906
+ 'filename': os.path.basename(file_url.split('?')[0]),
1907
+ 'size': size_str,
1908
+ 'metadata': meta
1909
+ })
1910
+
1911
+ # Check for file links in onclick attributes
1912
+ onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]')
1913
+ for elem in onclick_elements:
1914
+ onclick = await elem.get_attribute('onclick')
1915
+ urls = re.findall(r'(https?://[^\'"]+)', onclick)
1916
+ for url_match in urls:
1917
+ if any(url_match.lower().endswith(ext) for ext in all_exts):
1918
+ size_str = await self.get_file_size(url_match)
1919
+ meta = {}
1920
+ if url_match.lower().endswith('.pdf'):
1921
+ meta = await self.get_pdf_metadata(url_match)
1922
+ found_files.append({
1923
+ 'url': url_match,
1924
+ 'filename': os.path.basename(url_match.split('?')[0]),
1925
+ 'size': size_str,
1926
+ 'metadata': meta
1927
+ })
1928
+
1929
+ seen_urls = set()
1930
+ unique_files = []
1931
+ for f in found_files:
1932
+ if f['url'] not in seen_urls:
1933
+ seen_urls.add(f['url'])
1934
+ unique_files.append(f)
1935
+ return unique_files
1936
+ except Exception as e:
1937
+ logger.error(f"Error extracting files from {url}: {e}")
1938
+ return []
1939
+
1940
+ async def download_file(self, file_info, save_dir, referer):
1941
+ file_url = file_info['url']
1942
+ fname = file_info['filename']
1943
+ path = os.path.join(save_dir, fname)
1944
+ base, ext = os.path.splitext(fname)
1945
+ counter = 1
1946
+ while os.path.exists(path):
1947
+ path = os.path.join(save_dir, f"{base}_{counter}{ext}")
1948
+ counter += 1
1949
+ os.makedirs(save_dir, exist_ok=True)
1950
+
1951
+ try:
1952
+ # Special handling for Google Drive files
1953
+ if "drive.google.com" in file_url or "docs.google.com" in file_url:
1954
+ # Check if it's marked as view-only in metadata
1955
+ is_view_only = file_info.get('metadata', {}).get('view_only', False)
1956
+
1957
+ # For view-only files, try our most robust approach first
1958
+ if is_view_only:
1959
+ logger.info(f"Attempting to download view-only file: {file_url}")
1960
+ result_path = await self.force_download_viewonly(file_info, path)
1961
+ if result_path:
1962
+ return result_path
1963
+
1964
+ # If that failed, try the regular download approach
1965
+ logger.info("Primary method failed, trying fallback methods")
1966
+
1967
+ # Try regular download methods
1968
+ success = await self.download_from_google_drive(file_url, path)
1969
+ if success:
1970
+ return path
1971
+
1972
+ # If all methods failed for Google Drive, try one last approach
1973
+ logger.warning("All standard methods failed, attempting force download")
1974
+ result_path = await self.force_download_viewonly(file_info, path)
1975
+ return result_path if result_path else None
1976
+
1977
+ # Original code for non-Google Drive downloads
1978
+ async with self.context.new_page() as page:
1979
+ headers = {
1980
+ 'Accept': '*/*',
1981
+ 'Accept-Encoding': 'gzip, deflate, br',
1982
+ 'Referer': referer
1983
+ }
1984
+ response = await page.request.get(file_url, headers=headers, timeout=30000)
1985
+ if response.status == 200:
1986
+ content = await response.body()
1987
+ with open(path, 'wb') as f:
1988
+ f.write(content)
1989
+ return path
1990
+ else:
1991
+ logger.error(f"Download failed with status {response.status}: {file_url}")
1992
+ return None
1993
+ except Exception as e:
1994
+ logger.error(f"Error downloading {file_url}: {e}")
1995
+ return None
1996
+
1997
+ async def force_download_viewonly(self, file_info, save_path):
1998
+ """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs"""
1999
+ try:
2000
+ # Extract file ID
2001
+ file_id = file_info.get('metadata', {}).get('file_id')
2002
+ if not file_id:
2003
+ url = file_info['url']
2004
+ for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
2005
+ match = re.search(pattern, url)
2006
+ if match:
2007
+ file_id = match.group(1)
2008
+ break
2009
+
2010
+ if not file_id:
2011
+ logger.error("Could not extract file ID")
2012
+ return None
2013
+
2014
+ file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
2015
+ base, ext = os.path.splitext(save_path)
2016
+ if not ext:
2017
+ save_path = f"{base}.{file_type}"
2018
+
2019
+ logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
2020
+
2021
+ # Create a dedicated browser instance with better resolution
2022
+ browser = await self.playwright.chromium.launch(
2023
+ headless=True,
2024
+ args=[
2025
+ '--no-sandbox',
2026
+ '--disable-setuid-sandbox',
2027
+ '--disable-dev-shm-usage',
2028
+ '--disable-web-security',
2029
+ '--disable-features=IsolateOrigins,site-per-process',
2030
+ '--disable-site-isolation-trials'
2031
+ ]
2032
+ )
2033
+
2034
+ # Use higher resolution for better quality
2035
+ context = await browser.new_context(
2036
+ viewport={'width': 1600, 'height': 1200},
2037
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
2038
+ device_scale_factor=2.0
2039
+ )
2040
+
2041
+ page = await context.new_page()
2042
+
2043
+ try:
2044
+ # Go to the file view page
2045
+ logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
2046
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
2047
+ await page.wait_for_load_state('networkidle')
2048
+ await page.wait_for_timeout(5000) # Wait longer for everything to load
2049
+
2050
+ # Create temp directory
2051
+ temp_dir = tempfile.mkdtemp()
2052
+
2053
+ # Special handling for PDFs
2054
+ if file_type.lower() == 'pdf':
2055
+ # Check if there's a pagination control
2056
+ pagination_exists = await page.query_selector('div[role="toolbar"] div[role="presentation"] div[role="presentation"]:has-text("/")')
2057
+
2058
+ # Try multiple methods to extract total pages
2059
+ total_pages = await page.evaluate("""
2060
+ () => {
2061
+ // Method 1: Check page counter text
2062
+ const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
2063
+ const text = el.textContent || '';
2064
+ return /\\d+\\s*\\/\\s*\\d+/.test(text);
2065
+ });
2066
+
2067
+ if (pageCounters.length > 0) {
2068
+ const text = pageCounters[0].textContent || '';
2069
+ const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
2070
+ if (match && match[2]) return parseInt(match[2]);
2071
+ }
2072
+
2073
+ // Method 2: Check actual page elements
2074
+ const pageElements = document.querySelectorAll('.drive-viewer-paginated-page');
2075
+ if (pageElements.length > 0) return pageElements.length;
2076
+
2077
+ // Method 3: Look for page thumbnails
2078
+ const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
2079
+ if (thumbnails.length > 0) return thumbnails.length;
2080
+
2081
+ // Fallback: conservative guess based on UI
2082
+ return 50; // Safe default when we can't determine
2083
+ }
2084
+ """)
2085
+
2086
+ logger.info(f"Detected {total_pages} pages in PDF")
2087
+
2088
+ if total_pages <= 1:
2089
+ # Additional check - sometimes the page count detection fails
2090
+ # Let's double-check by looking for next/previous buttons
2091
+ next_button = await page.query_selector('button[aria-label="Next page"]')
2092
+ if next_button:
2093
+ disabled = await next_button.get_attribute('disabled')
2094
+ if not disabled:
2095
+ logger.info("Found next button that's not disabled, document has multiple pages")
2096
+ total_pages = 100 # Set a high number, we'll stop when we can't go further
2097
+
2098
+ # If we still think it's a single page, use a more direct approach
2099
+ if total_pages <= 1:
2100
+ # Single page approach
2101
+ logger.info("Using single-page capture approach")
2102
+
2103
+ # Take a screenshot of the current view (should be the full document or first page)
2104
+ screenshot_path = os.path.join(temp_dir, "page.png")
2105
+
2106
+ # Try to screenshot just the document area if we can find it
2107
+ document_area = await page.query_selector('.drive-viewer-paginated-page')
2108
+ if document_area:
2109
+ await document_area.screenshot(path=screenshot_path)
2110
+ else:
2111
+ # Otherwise take a full screenshot
2112
+ await page.screenshot(path=screenshot_path)
2113
+
2114
+ # Convert to PDF
2115
+ from PIL import Image
2116
+ from reportlab.pdfgen import canvas as pdf_canvas
2117
+
2118
+ img = Image.open(screenshot_path)
2119
+ width, height = img.size
2120
+ c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
2121
+ c.drawImage(screenshot_path, 0, 0, width, height)
2122
+ c.save()
2123
+
2124
+ os.remove(screenshot_path)
2125
+ os.rmdir(temp_dir)
2126
+
2127
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
2128
+ return save_path
2129
+ return None
2130
+
2131
+ # Multi-page approach
2132
+ logger.info(f"Using multi-page capture approach for {total_pages} pages")
2133
+
2134
+ # CRITICAL: We need to go to the first page first
2135
+ # Check if we need to reset to first page
2136
+ current_page_text = await page.evaluate("""
2137
+ () => {
2138
+ const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
2139
+ const text = el.textContent || '';
2140
+ return /\\d+\\s*\\/\\s*\\d+/.test(text);
2141
+ });
2142
+
2143
+ if (pageCounters.length > 0) {
2144
+ return pageCounters[0].textContent || '';
2145
+ }
2146
+ return '';
2147
+ }
2148
+ """)
2149
+
2150
+ current_page = 1
2151
+ if current_page_text:
2152
+ match = re.search(r'(\d+)\s*\/\s*\d+', current_page_text)
2153
+ if match:
2154
+ current_page = int(match.group(1))
2155
+
2156
+ # If we're not on page 1, go back to first page
2157
+ if current_page > 1:
2158
+ logger.info(f"Currently on page {current_page}, navigating back to page 1")
2159
+
2160
+ # Look for an input field where we can directly set the page number
2161
+ page_input = await page.query_selector('input[aria-label="Page"]')
2162
+ if page_input:
2163
+ await page_input.fill("1")
2164
+ await page_input.press("Enter")
2165
+ await page.wait_for_timeout(1000)
2166
+ else:
2167
+ # Use prev button to go back to first page
2168
+ prev_button = await page.query_selector('button[aria-label="Previous page"]')
2169
+ if prev_button:
2170
+ # Keep clicking until we can't anymore
2171
+ for _ in range(current_page - 1):
2172
+ try:
2173
+ await prev_button.click()
2174
+ await page.wait_for_timeout(500)
2175
+ except Exception as e:
2176
+ logger.warning(f"Error clicking prev button: {e}")
2177
+ break
2178
+
2179
+ # Capture each page
2180
+ screenshots = []
2181
+ page_num = 1
2182
+ max_tries = min(total_pages + 10, 200) # Set a reasonable limit
2183
+ next_button = await page.query_selector('button[aria-label="Next page"]')
2184
+
2185
+ # Maximize the PDF view if possible
2186
+ await page.evaluate("""
2187
+ () => {
2188
+ // Try to find and click any "full page" or "maximize" buttons
2189
+ const fullViewButtons = Array.from(document.querySelectorAll('button'))
2190
+ .filter(b => b.textContent?.includes('Full') ||
2191
+ b.getAttribute('aria-label')?.includes('Full') ||
2192
+ b.getAttribute('aria-label')?.includes('fit page'));
2193
+ if (fullViewButtons.length > 0) {
2194
+ fullViewButtons[0].click();
2195
+ }
2196
+ }
2197
+ """)
2198
+
2199
+ await page.wait_for_timeout(1000) # Wait for view to adjust
2200
+
2201
+ while page_num <= max_tries:
2202
+ # Wait for the page to be fully loaded
2203
+ await page.wait_for_timeout(800)
2204
+
2205
+ # Take a screenshot of the current page
2206
+ screenshot_path = os.path.join(temp_dir, f"page_{page_num}.png")
2207
+
2208
+ # Try different methods to identify and capture just the page content
2209
+ page_content = await page.query_selector('.drive-viewer-paginated-page')
2210
+ if page_content:
2211
+ # Found the specific page element
2212
+ await page_content.screenshot(path=screenshot_path)
2213
+ else:
2214
+ # Fall back to screenshot of visible viewport
2215
+ await page.screenshot(path=screenshot_path)
2216
+
2217
+ screenshots.append(screenshot_path)
2218
+ logger.info(f"Captured page {page_num}")
2219
+
2220
+ # Check if we have a disabled next button (reached the end)
2221
+ if next_button:
2222
+ is_disabled = await next_button.get_attribute('disabled')
2223
+ if is_disabled == 'true' or is_disabled == 'disabled' or is_disabled is True:
2224
+ logger.info(f"Reached end of document after {page_num} pages")
2225
+ break
2226
+
2227
+ # Click the next button
2228
+ try:
2229
+ await next_button.click()
2230
+ await page.wait_for_timeout(800) # Wait for page transition
2231
+ page_num += 1
2232
+ except Exception as e:
2233
+ logger.error(f"Error clicking next button: {e}")
2234
+ # Try to get a fresh reference to the button
2235
+ next_button = await page.query_selector('button[aria-label="Next page"]')
2236
+ if not next_button:
2237
+ logger.warning("Next button disappeared, assuming end of document")
2238
+ break
2239
+ else:
2240
+ # Try to find the next button again
2241
+ next_button = await page.query_selector('button[aria-label="Next page"]')
2242
+ if not next_button:
2243
+ logger.warning("Could not find next button, stopping navigation")
2244
+ break
2245
+
2246
+ # Double-check if we've reached the expected total
2247
+ if page_num >= total_pages:
2248
+ logger.info(f"Reached expected total of {total_pages} pages")
2249
+ break
2250
+
2251
+ # Combine screenshots into PDF
2252
+ logger.info(f"Creating PDF from {len(screenshots)} captured pages")
2253
+
2254
+ from PIL import Image
2255
+ from reportlab.lib.pagesizes import letter
2256
+ from reportlab.pdfgen import canvas as pdf_canvas
2257
+
2258
+ # Use the size of the first screenshot to set PDF dimensions
2259
+ if screenshots:
2260
+ try:
2261
+ img = Image.open(screenshots[0])
2262
+ width, height = img.size
2263
+
2264
+ c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
2265
+
2266
+ for screenshot in screenshots:
2267
+ try:
2268
+ if os.path.exists(screenshot) and os.path.getsize(screenshot) > 100:
2269
+ img = Image.open(screenshot)
2270
+ c.drawImage(screenshot, 0, 0, width, height)
2271
+ c.showPage()
2272
+ except Exception as e:
2273
+ logger.error(f"Error adding page to PDF: {e}")
2274
+
2275
+ c.save()
2276
+
2277
+ # Clean up screenshots
2278
+ for screenshot in screenshots:
2279
+ if os.path.exists(screenshot):
2280
+ os.remove(screenshot)
2281
+
2282
+ logger.info(f"Successfully created PDF with {len(screenshots)} pages")
2283
+ except Exception as e:
2284
+ logger.error(f"Error creating PDF: {e}")
2285
+ else:
2286
+ logger.error("No screenshots captured to create PDF")
2287
+ else:
2288
+ # Non-PDF file handling
2289
+ screenshot_path = os.path.join(temp_dir, "file.png")
2290
+ await page.screenshot(path=screenshot_path)
2291
+
2292
+ if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']:
2293
+ # For document types, try to export directly
2294
+ await self.export_google_doc(file_id, file_type, save_path)
2295
+ else:
2296
+ # For other types, save the screenshot with appropriate extension
2297
+ shutil.copy(screenshot_path, save_path)
2298
+
2299
+ os.remove(screenshot_path)
2300
+
2301
+ # Clean up temp directory
2302
+ try:
2303
+ os.rmdir(temp_dir)
2304
+ except:
2305
+ pass
2306
+
2307
+ # Close browser
2308
+ await browser.close()
2309
+
2310
+ # Verify file exists and has content
2311
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
2312
+ logger.info(f"Successfully downloaded file to {save_path}")
2313
+ return save_path
2314
+ else:
2315
+ logger.error(f"Generated file is too small or missing: {save_path}")
2316
+ return None
2317
+
2318
+ except Exception as e:
2319
+ logger.error(f"Error during force download: {e}")
2320
+ if browser:
2321
+ await browser.close()
2322
+ return None
2323
+
2324
+ except Exception as e:
2325
+ logger.error(f"Force download preparation failed: {e}")
2326
+ return None
2327
+
2328
+ async def download_from_google_drive(self, url, save_path):
2329
+ """Enhanced method to download from Google Drive with multiple fallback approaches"""
2330
+ # Extract the file ID from different URL formats
2331
+ file_id = None
2332
+ url_patterns = [
2333
+ r'drive\.google\.com/file/d/([^/]+)',
2334
+ r'drive\.google\.com/open\?id=([^&]+)',
2335
+ r'docs\.google\.com/\w+/d/([^/]+)',
2336
+ r'id=([^&]+)',
2337
+ r'drive\.google\.com/uc\?id=([^&]+)',
2338
+ ]
2339
+
2340
+ for pattern in url_patterns:
2341
+ match = re.search(pattern, url)
2342
+ if match:
2343
+ file_id = match.group(1)
2344
+ break
2345
+
2346
+ if not file_id:
2347
+ logger.error(f"Could not extract file ID from URL: {url}")
2348
+ return False
2349
+
2350
+ # Determine file type first (important for handling different file types)
2351
+ file_type, is_view_only = await self.get_google_drive_file_info(file_id)
2352
+ logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}")
2353
+
2354
+ base, ext = os.path.splitext(save_path)
2355
+ if not ext and file_type:
2356
+ # Add the correct extension if missing
2357
+ save_path = f"{base}.{file_type}"
2358
+
2359
+ # For view-only files, use specialized approaches
2360
+ if is_view_only:
2361
+ # Approach 1: For PDFs, use the JS method
2362
+ if file_type == 'pdf':
2363
+ success = await self.download_viewonly_pdf_with_js(file_id, save_path)
2364
+ if success:
2365
+ return True
2366
+
2367
+ # Approach 2: For Google Docs, Sheets, etc., use export API
2368
+ if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']:
2369
+ success = await self.export_google_doc(file_id, file_type, save_path)
2370
+ if success:
2371
+ return True
2372
+
2373
+ # Approach 3: Try the direct screenshot method for any view-only file
2374
+ success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type)
2375
+ if success:
2376
+ return True
2377
+
2378
+ # Try standard approaches for non-view-only files
2379
+ try:
2380
+ # Try with gdown first
2381
+ import gdown
2382
+ output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True)
2383
+ if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
2384
+ with open(save_path, 'rb') as f:
2385
+ content = f.read(100) # Read first 100 bytes
2386
+ if b'<!DOCTYPE html>' not in content: # Check not HTML error page
2387
+ logger.info(f"Successfully downloaded with gdown: {url}")
2388
+ return True
2389
+ except Exception as e:
2390
+ logger.warning(f"gdown download failed: {e}")
2391
+
2392
+ # Try with requests and session cookies
2393
+ try:
2394
+ session = requests.Session()
2395
+ session.headers.update({'User-Agent': get_random_user_agent()})
2396
+
2397
+ # Visit the page first to get cookies
2398
+ session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30)
2399
+
2400
+ # Try download
2401
+ url = f"https://drive.google.com/uc?id={file_id}&export=download"
2402
+ response = session.get(url, stream=True, timeout=30)
2403
+
2404
+ # Check for confirmation token
2405
+ confirmation_token = None
2406
+ for k, v in response.cookies.items():
2407
+ if k.startswith('download_warning'):
2408
+ confirmation_token = v
2409
+ break
2410
+
2411
+ # Use confirmation token if found
2412
+ if confirmation_token:
2413
+ url = f"{url}&confirm={confirmation_token}"
2414
+ response = session.get(url, stream=True, timeout=60)
2415
+
2416
+ # Check if we're getting HTML instead of the file
2417
+ content_type = response.headers.get('Content-Type', '')
2418
+ if 'text/html' in content_type:
2419
+ logger.warning("Received HTML instead of file - likely download restriction")
2420
+ else:
2421
+ with open(save_path, 'wb') as f:
2422
+ for chunk in response.iter_content(chunk_size=1024*1024):
2423
+ if chunk:
2424
+ f.write(chunk)
2425
+
2426
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
2427
+ with open(save_path, 'rb') as f:
2428
+ content = f.read(100)
2429
+ if b'<!DOCTYPE html>' not in content:
2430
+ logger.info("Successfully downloaded with requests session")
2431
+ return True
2432
+ except Exception as e:
2433
+ logger.warning(f"Requests session download failed: {e}")
2434
+
2435
+ logger.warning("Standard download methods failed")
2436
+ return False
2437
+
2438
+ async def download_viewonly_pdf_with_js(self, file_id, save_path):
2439
+ """Download view-only PDF using the enhanced blob image caching technique"""
2440
+ try:
2441
+ # Create a dedicated browser instance
2442
+ browser = await self.playwright.chromium.launch(
2443
+ headless=True,
2444
+ args=[
2445
+ '--no-sandbox',
2446
+ '--disable-setuid-sandbox',
2447
+ '--disable-dev-shm-usage',
2448
+ '--disable-web-security'
2449
+ ]
2450
+ )
2451
+
2452
+ context = await browser.new_context(
2453
+ viewport={'width': 1600, 'height': 1200},
2454
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
2455
+ accept_downloads=True # Critical for handling the download event
2456
+ )
2457
+
2458
+ page = await context.new_page()
2459
+
2460
+ try:
2461
+ # Step 1: Navigate to the file
2462
+ logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
2463
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
2464
+ await page.wait_for_load_state('networkidle')
2465
+ await page.wait_for_timeout(5000) # Initial wait for content to load
2466
+
2467
+ # Step 2: Estimate the number of pages
2468
+ estimated_pages = await page.evaluate("""
2469
+ () => {
2470
+ // Look for page counter in the interface
2471
+ const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
2472
+ const text = el.textContent || '';
2473
+ return /\\d+\\s*\\/\\s*\\d+/.test(text);
2474
+ });
2475
+
2476
+ if (pageCounters.length > 0) {
2477
+ const text = pageCounters[0].textContent || '';
2478
+ const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
2479
+ if (match && match[2]) return parseInt(match[2]);
2480
+ }
2481
+
2482
+ // If we can't find a counter, check actual pages
2483
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
2484
+ if (pages.length > 0) return pages.length;
2485
+
2486
+ // Default to a reasonable number if we can't determine
2487
+ return 50;
2488
+ }
2489
+ """)
2490
+
2491
+ logger.info(f"Estimated number of pages: {estimated_pages}")
2492
+
2493
+ # Step 3: Initial scroll to trigger loading
2494
+ logger.info("Initial scroll to bottom to trigger lazy loading...")
2495
+ await page.keyboard.press("End")
2496
+ await page.wait_for_timeout(3000)
2497
+
2498
+ # Step 4: Wait for all pages to load by pressing PageDown and checking blob images
2499
+ logger.info("Waiting for all pages to load...")
2500
+ max_attempts = min(estimated_pages * 3, 300) # Adjust based on document size
2501
+ attempt = 0
2502
+
2503
+ while attempt < max_attempts:
2504
+ # Count blob images (which are the PDF pages)
2505
+ blob_count = await page.evaluate("""
2506
+ Array.from(document.getElementsByTagName('img'))
2507
+ .filter(img => img.src.startsWith('blob:') && img.width > 100)
2508
+ .length
2509
+ """)
2510
+
2511
+ logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
2512
+
2513
+ # If we've loaded enough pages or reached estimated count
2514
+ if blob_count >= estimated_pages:
2515
+ logger.info("All pages appear to be loaded.")
2516
+ break
2517
+
2518
+ # Press PageDown to scroll further and trigger more loading
2519
+ await page.keyboard.press("PageDown")
2520
+ await page.wait_for_timeout(2000) # Wait for content to load
2521
+ attempt += 1
2522
+
2523
+ # Extra wait to ensure everything is fully loaded
2524
+ await page.wait_for_timeout(5000)
2525
+
2526
+ # Step 5: Set up a download event listener
2527
+ download_promise = page.wait_for_event("download")
2528
+
2529
+ # Step 6: Inject the jsPDF script to generate PDF
2530
+ logger.info("Generating PDF from loaded pages...")
2531
+ result = await page.evaluate(r'''
2532
+ (function() {
2533
+ return new Promise((resolve, reject) => {
2534
+ let script = document.createElement("script");
2535
+ script.onload = function () {
2536
+ try {
2537
+ let pdf = new jsPDF();
2538
+ let imgs = document.getElementsByTagName("img");
2539
+ let added = 0;
2540
+
2541
+ // First collect and sort all valid blob images
2542
+ let validImages = [];
2543
+ for (let i = 0; i < imgs.length; i++) {
2544
+ let img = imgs[i];
2545
+ if (!/^blob:/.test(img.src)) continue;
2546
+ if (img.width < 100 || img.height < 100) continue;
2547
+ validImages.push(img);
2548
+ }
2549
+
2550
+ // Sort by vertical position
2551
+ validImages.sort((a, b) => {
2552
+ const rectA = a.getBoundingClientRect();
2553
+ const rectB = b.getBoundingClientRect();
2554
+ return rectA.top - rectB.top;
2555
+ });
2556
+
2557
+ console.log(`Found ${validImages.length} valid page images to add to PDF`);
2558
+
2559
+ // Process each image as a page
2560
+ for (let i = 0; i < validImages.length; i++) {
2561
+ let img = validImages[i];
2562
+ let canvas = document.createElement("canvas");
2563
+ let ctx = canvas.getContext("2d");
2564
+ canvas.width = img.width;
2565
+ canvas.height = img.height;
2566
+ ctx.drawImage(img, 0, 0, img.width, img.height);
2567
+ let imgData = canvas.toDataURL("image/jpeg", 1.0);
2568
+
2569
+ if (added > 0) {
2570
+ pdf.addPage();
2571
+ }
2572
+
2573
+ pdf.addImage(imgData, 'JPEG', 0, 0);
2574
+ added++;
2575
+ }
2576
+
2577
+ pdf.save("download.pdf");
2578
+ resolve({success: true, pageCount: added});
2579
+ } catch (error) {
2580
+ reject({success: false, error: error.toString()});
2581
+ }
2582
+ };
2583
+
2584
+ script.onerror = function() {
2585
+ reject({success: false, error: "Failed to load jsPDF library"});
2586
+ };
2587
+
2588
+ // Use a reliable CDN
2589
+ script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js';
2590
+ document.body.appendChild(script);
2591
+ });
2592
+ })();
2593
+ ''')
2594
+
2595
+ if not result.get('success'):
2596
+ logger.error(f"Error in PDF generation: {result.get('error')}")
2597
+ return False
2598
+
2599
+ logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
2600
+
2601
+ # Step 7: Wait for the download to complete and save the file
2602
+ download = await download_promise
2603
+
2604
+ # Step 8: Save the downloaded file to the specified path
2605
+ await download.save_as(save_path)
2606
+ logger.info(f"Successfully saved PDF to {save_path}")
2607
+
2608
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 1000
2609
+
2610
+ finally:
2611
+ await browser.close()
2612
+
2613
+ except Exception as e:
2614
+ logger.error(f"Error in viewonly PDF download process: {e}")
2615
+ return False
2616
+
2617
+ async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
2618
+ """Download any view-only file by taking screenshots"""
2619
+ try:
2620
+ async with self.context.new_page() as page:
2621
+ # Set high-resolution viewport
2622
+ await page.set_viewport_size({"width": 1600, "height": 1200})
2623
+
2624
+ # Navigate to the file
2625
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
2626
+
2627
+ # Make sure the file is loaded
2628
+ await page.wait_for_load_state('networkidle')
2629
+ await page.wait_for_timeout(3000) # Extra time for rendering
2630
+
2631
+ # Create directory for screenshots if multiple pages
2632
+ base_dir = os.path.dirname(save_path)
2633
+ base_name = os.path.splitext(os.path.basename(save_path))[0]
2634
+ screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots")
2635
+ os.makedirs(screenshots_dir, exist_ok=True)
2636
+
2637
+ # Check if it's a multi-page document
2638
+ is_multi_page = await page.evaluate("""
2639
+ () => {
2640
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
2641
+ return pages.length > 1;
2642
+ }
2643
+ """)
2644
+
2645
+ if is_multi_page and file_type == 'pdf':
2646
+ # For multi-page PDFs, take screenshots of each page
2647
+ page_count = await page.evaluate("""
2648
+ async () => {
2649
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
2650
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
2651
+ const container = document.querySelector('.drive-viewer-paginated-scrollable');
2652
+
2653
+ if (!container || pages.length === 0) return 0;
2654
+
2655
+ // Scroll through to make sure all pages are loaded
2656
+ const scrollHeight = container.scrollHeight;
2657
+ const viewportHeight = container.clientHeight;
2658
+ const scrollStep = viewportHeight;
2659
+
2660
+ for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
2661
+ container.scrollTo(0, scrollPos);
2662
+ await delay(300);
2663
+ }
2664
+
2665
+ // Scroll back to top
2666
+ container.scrollTo(0, 0);
2667
+ await delay(300);
2668
+
2669
+ return pages.length;
2670
+ }
2671
+ """)
2672
+
2673
+ logger.info(f"Found {page_count} pages in document")
2674
+
2675
+ # Take screenshots of each page
2676
+ screenshots = []
2677
+ for i in range(page_count):
2678
+ # Scroll to page
2679
+ await page.evaluate(f"""
2680
+ async () => {{
2681
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
2682
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
2683
+ if (pages.length <= {i}) return false;
2684
+
2685
+ pages[{i}].scrollIntoView();
2686
+ await delay(500);
2687
+ return true;
2688
+ }}
2689
+ """)
2690
+
2691
+ # Take screenshot
2692
+ screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
2693
+ await page.screenshot(path=screenshot_path, clip={
2694
+ 'x': 0,
2695
+ 'y': 0,
2696
+ 'width': 1600,
2697
+ 'height': 1200
2698
+ })
2699
+ screenshots.append(screenshot_path)
2700
+
2701
+ # Combine screenshots into PDF
2702
+ from PIL import Image
2703
+ from reportlab.pdfgen import canvas
2704
+
2705
+ c = canvas.Canvas(save_path)
2706
+ for screenshot in screenshots:
2707
+ img = Image.open(screenshot)
2708
+ width, height = img.size
2709
+
2710
+ # Add page to PDF
2711
+ c.setPageSize((width, height))
2712
+ c.drawImage(screenshot, 0, 0, width, height)
2713
+ c.showPage()
2714
+
2715
+ c.save()
2716
+
2717
+ # Clean up screenshots
2718
+ for screenshot in screenshots:
2719
+ os.remove(screenshot)
2720
+ os.rmdir(screenshots_dir)
2721
+
2722
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
2723
+ else:
2724
+ # For single-page or non-PDF files, just take one screenshot
2725
+ screenshot_path = os.path.join(screenshots_dir, "screenshot.png")
2726
+ await page.screenshot(path=screenshot_path, fullPage=True)
2727
+
2728
+ # Convert to requested format if needed
2729
+ if file_type == 'pdf':
2730
+ from PIL import Image
2731
+ from reportlab.pdfgen import canvas
2732
+
2733
+ # Create PDF from screenshot
2734
+ img = Image.open(screenshot_path)
2735
+ width, height = img.size
2736
+
2737
+ c = canvas.Canvas(save_path, pagesize=(width, height))
2738
+ c.drawImage(screenshot_path, 0, 0, width, height)
2739
+ c.save()
2740
+ else:
2741
+ # Just copy the screenshot to the destination with proper extension
2742
+ shutil.copy(screenshot_path, save_path)
2743
+
2744
+ # Clean up
2745
+ os.remove(screenshot_path)
2746
+ os.rmdir(screenshots_dir)
2747
+
2748
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
2749
+
2750
+ except Exception as e:
2751
+ logger.error(f"Error taking screenshots: {e}")
2752
+ return False
2753
+
2754
+ async def export_google_doc(self, file_id, file_type, save_path):
2755
+ """Export Google Docs/Sheets/Slides to downloadable formats"""
2756
+ try:
2757
+ # Map file types to export formats
2758
+ export_formats = {
2759
+ 'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # docx
2760
+ 'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
2761
+ 'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # xlsx
2762
+ 'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
2763
+ 'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # pptx
2764
+ 'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
2765
+ 'pdf': 'application/pdf',
2766
+ }
2767
+
2768
+ export_format = export_formats.get(file_type, 'application/pdf')
2769
+ export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}"
2770
+
2771
+ if 'sheet' in file_type or 'xlsx' in file_type:
2772
+ export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
2773
+ elif 'ppt' in file_type or 'presentation' in file_type:
2774
+ export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
2775
+ elif file_type == 'pdf':
2776
+ export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
2777
+
2778
+ async with self.context.new_page() as page:
2779
+ # Get cookies from the main view page first
2780
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle')
2781
+
2782
+ # Now try the export
2783
+ response = await page.goto(export_url, wait_until='networkidle')
2784
+
2785
+ if response.status == 200:
2786
+ content = await response.body()
2787
+ with open(save_path, 'wb') as f:
2788
+ f.write(content)
2789
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
2790
+ else:
2791
+ logger.warning(f"Export failed with status {response.status}")
2792
+ return False
2793
+
2794
+ except Exception as e:
2795
+ logger.error(f"Error exporting Google Doc: {e}")
2796
+ return False
2797
+
2798
+ async def get_google_drive_file_info(self, file_id):
2799
+ """Get file type and view-only status from Google Drive"""
2800
+ file_type = None
2801
+ is_view_only = False
2802
+
2803
+ try:
2804
+ async with self.context.new_page() as page:
2805
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
2806
+
2807
+ # Check if view-only
2808
+ view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
2809
+ is_view_only = view_only_text is not None
2810
+
2811
+ # Check for Google Docs viewer
2812
+ gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
2813
+ gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
2814
+ gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
2815
+
2816
+ if gdocs_viewer:
2817
+ file_type = 'docx'
2818
+ elif gsheets_viewer:
2819
+ file_type = 'xlsx'
2820
+ elif gslides_viewer:
2821
+ file_type = 'pptx'
2822
+ else:
2823
+ # Check for PDF viewer
2824
+ pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
2825
+ if pdf_viewer:
2826
+ file_type = 'pdf'
2827
+ else:
2828
+ # Check for image viewer
2829
+ img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
2830
+ if img_viewer:
2831
+ # Get image type from src
2832
+ img_src = await img_viewer.get_attribute('src')
2833
+ if 'jpg' in img_src or 'jpeg' in img_src:
2834
+ file_type = 'jpg'
2835
+ elif 'png' in img_src:
2836
+ file_type = 'png'
2837
+ else:
2838
+ file_type = 'jpg' # Default to jpg
2839
+ else:
2840
+ # Generic file type fallback
2841
+ file_type = 'pdf' # Default to PDF
2842
+
2843
+ # If still no type, check filename
2844
+ if not file_type:
2845
+ title_element = await page.query_selector('div[role="heading"]')
2846
+ if title_element:
2847
+ title = await title_element.text_content()
2848
+ if title:
2849
+ ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
2850
+ if ext_match:
2851
+ file_type = ext_match.group(1).lower()
2852
+
2853
+ except Exception as e:
2854
+ logger.error(f"Error getting Google Drive file info: {e}")
2855
+ file_type = 'pdf' # Default to PDF if we can't determine
2856
+
2857
+ return file_type, is_view_only
2858
+
2859
+ async def get_sublinks(self, url, limit=10000):
2860
+ """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
2861
+ links = set()
2862
+ try:
2863
+ logger.info(f"Fetching sublinks from: {url}")
2864
+
2865
+ # Go to page and wait for full load
2866
+ await self.page.goto(url, timeout=30000, wait_until='networkidle')
2867
+
2868
+ # Get base URL for resolving relative links
2869
+ parsed_base = urlparse(url)
2870
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
2871
+ path_base = os.path.dirname(parsed_base.path)
2872
+
2873
+ # Check if page has ASP.NET elements which might need special handling
2874
+ is_aspnet = await self.page.evaluate('''
2875
+ () => {
2876
+ return document.querySelector('form#aspnetForm') !== null ||
2877
+ document.querySelector('input[name="__VIEWSTATE"]') !== null;
2878
+ }
2879
+ ''')
2880
+
2881
+ if is_aspnet:
2882
+ logger.info("Detected ASP.NET page, using enhanced extraction method")
2883
+
2884
+ # Try to interact with ASP.NET controls that might reveal more links
2885
+ # Look for dropdowns, buttons, and grid elements
2886
+ dropdowns = await self.page.query_selector_all('select')
2887
+ buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button')
2888
+
2889
+ # Try interacting with dropdowns first
2890
+ for dropdown in dropdowns:
2891
+ try:
2892
+ # Get all options
2893
+ options = await self.page.evaluate('''
2894
+ (dropdown) => {
2895
+ return Array.from(dropdown.options).map(o => o.value);
2896
+ }
2897
+ ''', dropdown)
2898
+
2899
+ # Try selecting each option
2900
+ for option in options:
2901
+ if option:
2902
+ await dropdown.select_option(value=option)
2903
+ await self.page.wait_for_timeout(1000)
2904
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
2905
+
2906
+ # Extract any new links that appeared
2907
+ await self.extract_all_link_types(links, base_url, path_base)
2908
+ except Exception as e:
2909
+ logger.warning(f"Error interacting with dropdown: {e}")
2910
+
2911
+ # Try clicking buttons (but avoid dangerous ones like "delete")
2912
+ safe_buttons = []
2913
+ for button in buttons:
2914
+ button_text = await button.text_content() or ""
2915
+ button_value = await button.get_attribute("value") or ""
2916
+ button_id = await button.get_attribute("id") or ""
2917
+ combined_text = (button_text + button_value + button_id).lower()
2918
+
2919
+ # Skip potentially destructive buttons
2920
+ if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]):
2921
+ continue
2922
+
2923
+ # Prioritize buttons that might show more content
2924
+ if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]):
2925
+ safe_buttons.append(button)
2926
+
2927
+ # Click the safe buttons
2928
+ for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks
2929
+ try:
2930
+ await button.click()
2931
+ await self.page.wait_for_timeout(1000)
2932
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
2933
+
2934
+ # Extract any new links that appeared
2935
+ await self.extract_all_link_types(links, base_url, path_base)
2936
+ except Exception as e:
2937
+ logger.warning(f"Error clicking button: {e}")
2938
+
2939
+ # Extract links from the initial page state
2940
+ await self.extract_all_link_types(links, base_url, path_base)
2941
+
2942
+ # Look specifically for links inside grid/table views which are common in ASP.NET applications
2943
+ grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a')
2944
+ for cell in grid_cells:
2945
+ try:
2946
+ href = await cell.get_attribute('href')
2947
+ if href:
2948
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
2949
+ links.add(full_url)
2950
+ except Exception as e:
2951
+ logger.warning(f"Error extracting grid link: {e}")
2952
+
2953
+ # Extract links from onclick attributes and javascript:__doPostBack calls
2954
+ postback_links = await self.page.evaluate('''
2955
+ () => {
2956
+ const results = [];
2957
+ // Find elements with onclick containing __doPostBack
2958
+ const elements = document.querySelectorAll('*[onclick*="__doPostBack"]');
2959
+ for (const el of elements) {
2960
+ // Extract the postback target
2961
+ const onclick = el.getAttribute('onclick') || '';
2962
+ const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/);
2963
+ if (match && match[1]) {
2964
+ // Get the visible text to use as description
2965
+ const text = el.innerText || el.textContent || 'Link';
2966
+ results.push({
2967
+ id: match[1],
2968
+ text: text.trim()
2969
+ });
2970
+ }
2971
+ }
2972
+ return results;
2973
+ }
2974
+ ''')
2975
+
2976
+ # Try interacting with some of the postback links
2977
+ for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions
2978
+ try:
2979
+ logger.info(f"Trying postback link: {postback['text']} ({postback['id']})")
2980
+ await self.page.evaluate(f'''
2981
+ () => {{
2982
+ if (typeof __doPostBack === 'function') {{
2983
+ __doPostBack('{postback["id"]}', '');
2984
+ }}
2985
+ }}
2986
+ ''')
2987
+ await self.page.wait_for_timeout(1500)
2988
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
2989
+
2990
+ # Extract any new links that appeared
2991
+ await self.extract_all_link_types(links, base_url, path_base)
2992
+ except Exception as e:
2993
+ logger.warning(f"Error with postback: {e}")
2994
+
2995
+ logger.info(f"Found {len(links)} sublinks")
2996
+ return list(links)[:limit]
2997
+
2998
+ except Exception as e:
2999
+ logger.error(f"Error getting sublinks from {url}: {e}")
3000
+ return list(links)[:limit] # Return what we have so far
3001
+
3002
+ async def extract_all_link_types(self, links_set, base_url, path_base):
3003
+ """Extract all types of links from the current page"""
3004
+ # Get all <a> tag links
3005
+ a_links = await self.page.query_selector_all('a[href]')
3006
+ for a in a_links:
3007
+ try:
3008
+ href = await a.get_attribute('href')
3009
+ if href and not href.startswith('javascript:') and not href.startswith('#'):
3010
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
3011
+ links_set.add(full_url)
3012
+ except Exception:
3013
+ pass
3014
+
3015
+ # Get iframe sources
3016
+ iframes = await self.page.query_selector_all('iframe[src]')
3017
+ for iframe in iframes:
3018
+ try:
3019
+ src = await iframe.get_attribute('src')
3020
+ if src and not src.startswith('javascript:') and not src.startswith('about:'):
3021
+ full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
3022
+ links_set.add(full_url)
3023
+ except Exception:
3024
+ pass
3025
+
3026
+ # Get links from onclick attributes that reference URLs
3027
+ onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]')
3028
+ for el in onclick_elements:
3029
+ try:
3030
+ onclick = await el.get_attribute('onclick')
3031
+ urls = re.findall(r'(https?://[^\'"]+)', onclick)
3032
+ for url in urls:
3033
+ links_set.add(url)
3034
+ except Exception:
3035
+ pass
3036
+
3037
+ # Look for URLs in data-* attributes
3038
+ data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]')
3039
+ for el in data_elements:
3040
+ for attr in ['data-url', 'data-href', 'data-src']:
3041
+ try:
3042
+ value = await el.get_attribute(attr)
3043
+ if value and not value.startswith('javascript:'):
3044
+ full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
3045
+ links_set.add(full_url)
3046
+ except Exception:
3047
+ pass
3048
+
3049
+ # Look for special anchor links that might not have href attributes
3050
+ special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a')
3051
+ for anchor in special_anchors:
3052
+ try:
3053
+ href = await anchor.get_attribute('href')
3054
+ if href and not href.startswith('javascript:') and not href.startswith('#'):
3055
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
3056
+ links_set.add(full_url)
3057
+ except Exception:
3058
+ pass
3059
+
3060
+ def resolve_relative_url(self, relative_url, base_url, path_base):
3061
+ """Properly resolve relative URLs considering multiple formats"""
3062
+ if relative_url.startswith('/'):
3063
+ # Absolute path relative to domain
3064
+ return f"{base_url}{relative_url}"
3065
+ elif relative_url.startswith('./'):
3066
+ # Explicit relative path
3067
+ return f"{base_url}{path_base}/{relative_url[2:]}"
3068
+ elif relative_url.startswith('../'):
3069
+ # Parent directory
3070
+ parent_path = '/'.join(path_base.split('/')[:-1])
3071
+ return f"{base_url}{parent_path}/{relative_url[3:]}"
3072
+ else:
3073
+ # Regular relative path
3074
+ return f"{base_url}{path_base}/{relative_url}"
3075
+
3076
+ async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
3077
+ if not custom_ext_list:
3078
+ custom_ext_list = []
3079
+ progress_text = st.empty()
3080
+ progress_bar = st.progress(0)
3081
+ file_count_text = st.empty()
3082
+
3083
+ try:
3084
+ progress_text.text("Analyzing main page...")
3085
+ # Special handling for ASP.NET pages
3086
+ is_aspnet = False
3087
+ try:
3088
+ await self.page.goto(url, timeout=30000, wait_until='networkidle')
3089
+ is_aspnet = await self.page.evaluate('''
3090
+ () => {
3091
+ return document.querySelector('form#aspnetForm') !== null ||
3092
+ document.querySelector('input[name="__VIEWSTATE"]') !== null;
3093
+ }
3094
+ ''')
3095
+ except Exception:
3096
+ pass
3097
+
3098
+ # Extract files from main page
3099
+ main_files = await self.extract_downloadable_files(url, custom_ext_list)
3100
+ initial_count = len(main_files)
3101
+ file_count_text.text(f"Found {initial_count} files on main page")
3102
+
3103
+ # Get sublinks with enhanced method
3104
+ progress_text.text("Getting sublinks...")
3105
+ sublinks = await self.get_sublinks(url, sublink_limit)
3106
+ total_links = len(sublinks)
3107
+ progress_text.text(f"Found {total_links} sublinks to process")
3108
+
3109
+ if not sublinks:
3110
+ progress_bar.progress(1.0)
3111
+ return main_files
3112
+
3113
+ # Process each sublink
3114
+ all_files = main_files
3115
+ for i, sublink in enumerate(sublinks, 1):
3116
+ progress = i / total_links
3117
+ progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
3118
+ progress_bar.progress(progress)
3119
+
3120
+ try:
3121
+ # Use a longer timeout for ASP.NET pages which can be slower
3122
+ sub_timeout = timeout * 2 if is_aspnet else timeout
3123
+
3124
+ # Extract files from sublink with appropriate timeout
3125
+ async with async_timeout(sub_timeout):
3126
+ sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
3127
+ all_files.extend(sub_files)
3128
+ file_count_text.text(f"Found {len(all_files)} total files")
3129
+ except Exception as e:
3130
+ logger.warning(f"Error processing sublink {sublink}: {e}")
3131
+
3132
+ # Deduplicate files
3133
+ seen_urls = set()
3134
+ unique_files = []
3135
+ for f in all_files:
3136
+ if f['url'] not in seen_urls:
3137
+ seen_urls.add(f['url'])
3138
+ unique_files.append(f)
3139
 
3140
+ final_count = len(unique_files)
3141
+ progress_text.text(f"Deep search complete!")
3142
+ file_count_text.text(f"Found {final_count} unique files")
3143
+ progress_bar.progress(1.0)
3144
+ return unique_files
3145
+
3146
+ except Exception as e:
3147
+ logger.error(f"Deep search error: {e}")
3148
+ progress_text.text(f"Error during deep search: {str(e)}")
3149
+ return []
3150
+
3151
+ finally:
3152
+ await asyncio.sleep(2)
3153
+ if not st.session_state.get('keep_progress', False):
3154
+ progress_text.empty()
3155
+ progress_bar.empty()
3156
  # Utility Functions for New Features
3157
  def extract_keywords(text, n=5):
3158
  doc = nlp_model(text)
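
For orientation, the following is a minimal usage sketch of the DownloadManager class introduced by this commit, written as plain async Python. It is not part of the commit; the query string, extension list, and ./downloads directory are illustrative placeholders, and it assumes the module-level helpers the class relies on (get_random_user_agent, sizeof_fmt, logger) are already defined earlier in app.py.

import asyncio

async def demo():
    # Hypothetical driver; adjust the query and save_dir as needed.
    async with DownloadManager(query="annual report filetype:pdf", num_results=3) as dm:
        urls = await dm.search_bing()  # Bing result URLs for the query
        for url in urls:
            # Collect candidate files (default extensions plus .csv) from each result page
            files = await dm.extract_downloadable_files(url, custom_ext_list=['.csv'])
            for info in files:
                saved = await dm.download_file(info, save_dir="./downloads", referer=url)
                if saved:
                    print(f"Saved {info['filename']} ({info['size']}) -> {saved}")

# asyncio.run(demo())

deep_search() is not exercised here because it drives Streamlit progress widgets (st.progress, st.empty) and is meant to be called from the app's UI flow rather than from a bare script.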