Update app.py
app.py CHANGED
@@ -8,10 +8,10 @@ import sys
 
 def install_playwright_dependencies():
     try:
-        #
+        # Update package list
         os.system('apt-get update -y')
 
-        # Install required dependencies
+        # Install required dependencies including GTK
         dependencies = [
             'libnss3',
             'libnspr4',
@@ -20,7 +20,9 @@ def install_playwright_dependencies():
             'libcups2',
             'libxcomposite1',
             'libxdamage1',
-            'libatspi2.0-0'
+            'libatspi2.0-0',
+            'libgtk-3-0',  # Add GTK dependencies
+            'libgdk-3-0'
         ]
 
         dependency_command = f"apt-get install -y {' '.join(dependencies)}"
@@ -281,7 +283,7 @@ class DownloadManager:
         self.browser = None
         self.context = None
         self.page = None
-
+
     async def __aenter__(self):
         self.playwright = await async_playwright().start()
         opts = {"headless": True}
@@ -337,25 +339,12 @@ class DownloadManager:
         try:
             async with self.context.new_page() as page:
                 response = await page.goto(url, wait_until='networkidle', timeout=30000)
-
-                # Check if the response is a redirect
                 if response and response.headers.get('location'):
                     return response.headers['location']
-
-                # Check if response is a file
                 content_type = response.headers.get('content-type', '')
                 if 'text/html' not in content_type.lower():
                     return url
-
-                # Look for meta refresh
                 content = await page.content()
-                soup = BeautifulSoup(content, 'html.parser')
-                meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
-                if meta_refresh:
-                    content = meta_refresh.get('content', '')
-                    if 'url=' in content.lower():
-                        return content.split('url=')[-1].strip()
-
                 return page.url
         except Exception as e:
             logger.error(f"Error extracting real download URL: {e}")
@@ -364,82 +353,53 @@ class DownloadManager:
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
-            # First try to load the page
             response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
             if not response:
                 return []
 
             final_url = self.page.url
-
-            # Handle redirects and download scripts
-            if '.php' in final_url or 'download' in final_url or 'get' in final_url:
+            if '.php' in final_url or 'download' in final_url:
                 real_url = await self.extract_real_download_url(final_url)
                 if real_url != final_url:
-                    })
-                    return found_files
+                    found_files.append({
+                        'url': real_url,
+                        'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                        'size': await self.get_file_size(real_url),
+                        'metadata': {}
+                    })
+                    return found_files
 
             await self.page.wait_for_load_state('networkidle', timeout=30000)
-            await human_like_interactions(self.page)
-
             content = await self.page.content()
             soup = BeautifulSoup(content, 'html.parser')
 
-                            '.png', '.jpg', '.jpeg', '.gif', '.xlsx', '.xls', '.ppt', '.pptx', '.txt']
+            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
+                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
             all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
 
-            # Parse base URL for relative links
             parsed_base = urlparse(final_url)
             base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
 
-            # Find all links
             for a in soup.find_all('a', href=True):
                 href = a['href'].strip()
-
-                # Skip empty or javascript links
-                if not href or href.startswith('javascript:') or href == '#':
-                    continue
-
-                # Handle special cases (PHP scripts, download handlers)
-                if '.php' in href.lower() or 'download' in href.lower() or 'get' in href.lower():
-                    full_url = href if href.startswith('http') else urljoin(base_url, href)
-                    real_url = await self.extract_real_download_url(full_url)
-                    if real_url and real_url != full_url:
-                        size_str = await self.get_file_size(real_url)
-                        found_files.append({
-                            'url': real_url,
-                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
-                            'size': size_str,
-                            'metadata': {}
-                        })
-                    continue
-
-                # Handle direct file links
                 if any(href.lower().endswith(ext) for ext in all_exts):
-                    file_url = href if href.startswith('http') else
+                    file_url = href if href.startswith('http') else (
+                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
+                    )
+
                     size_str = await self.get_file_size(file_url)
                     meta = {}
-
                     if file_url.lower().endswith('.pdf'):
                         meta = await self.get_pdf_metadata(file_url)
-
+
                     found_files.append({
                         'url': file_url,
-                        'filename': os.path.basename(
+                        'filename': os.path.basename(file_url.split('?')[0]),
                         'size': size_str,
                         'metadata': meta
                     })
-
-
-                elif any(x in href for x in ['drive.google.com', 'docs.google.com']):
+
+                elif ("drive.google.com" in href) or ("docs.google.com" in href):
                     file_id = None
                     for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
                         match = re.search(pattern, href)
@@ -449,35 +409,25 @@ class DownloadManager:
 
                 if file_id:
                     direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
-                            logger.error(f"Error processing Google Drive link: {e}")
+                    filename = file_id
+                    try:
+                        response = await self.page.request.head(direct_url, timeout=15000)
+                        cd = response.headers.get("Content-Disposition", "")
+                        if cd:
+                            mt = re.search(r'filename\*?="?([^";]+)', cd)
+                            if mt:
+                                filename = mt.group(1).strip('"').strip()
+
+                        found_files.append({
+                            'url': direct_url,
+                            'filename': filename,
+                            'size': await self.get_file_size(direct_url),
+                            'metadata': {}
+                        })
+                    except Exception as e:
+                        logger.error(f"Error processing Google Drive link: {e}")
-
-            # Make list unique based on URLs
-            seen_urls = set()
-            unique_files = []
-            for f in found_files:
-                if f['url'] not in seen_urls:
-                    seen_urls.add(f['url'])
-                    unique_files.append(f)
-
-            return unique_files
 
+            return found_files
         except Exception as e:
             logger.error(f"Error extracting files from {url}: {e}")
             return []
@@ -486,8 +436,6 @@ class DownloadManager:
         file_url = file_info['url']
         fname = file_info['filename']
         path = os.path.join(save_dir, fname)
-
-        # Handle duplicate filenames
         base, ext = os.path.splitext(fname)
         counter = 1
         while os.path.exists(path):
@@ -497,8 +445,7 @@
         os.makedirs(save_dir, exist_ok=True)
 
         try:
-
-            if 'drive.google.com' in file_url:
+            if "drive.google.com" in file_url:
                 import gdown
                 try:
                     st.write(f"Downloading from Google Drive: {fname}")
@@ -510,7 +457,6 @@
                     logger.error(f"Google Drive download error: {e}")
                     return None
 
-            # Handle normal downloads
             async with self.context.new_page() as page:
                 st.write(f"Downloading: {fname}")
 
@@ -637,7 +583,6 @@
             logger.error(f"Deep search error: {e}")
             return []
 
-# ---------- Main Streamlit UI Implementation -------------
 def main():
     if 'initialized' not in st.session_state:
         st.session_state.initialized = True
@@ -647,7 +592,6 @@ def main():
 
     st.title("Advanced File Downloader")
 
-    # Sidebar for settings
     with st.sidebar:
         st.header("Settings")
         mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
@@ -657,123 +601,63 @@ def main():
             "Custom File Extensions",
             placeholder=".csv, .txt, .epub"
         )
-        max_concurrency = st.slider(
-            "Max Concurrency",
-            min_value=1,
-            max_value=1000,
-            value=200
-        )
         use_proxy = st.checkbox("Use Proxy")
         proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
 
-        # Google OAuth Section
-        with st.expander("Google Drive Integration"):
-            if st.button("Start Google Sign-In"):
-                auth_url = get_google_auth_url()
-                st.markdown(f"[Click here to authorize]({auth_url})")
-
-            auth_code = st.text_input("Enter authorization code")
-            if st.button("Complete Sign-In") and auth_code:
-                creds, msg = exchange_code_for_credentials(auth_code)
-                st.session_state.google_creds = creds
-                st.write(msg)
-
-    # Main content area
     if mode == "Manual URL":
         st.header("Manual URL Mode")
         url = st.text_input("Enter URL", placeholder="https://example.com")
 
-                        custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
-                        max_concurrency=max_concurrency
-                    )
-                    st.session_state.discovered_files = files
-                    st.session_state.current_url = url
-                    return files
+        if st.button("Deep Search", use_container_width=True):
+            if url:
+                async def run_deep_search():
+                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                        with st.spinner("Searching for files..."):
+                            files = await dm.deep_search(
+                                url=url,
+                                custom_ext_list=custom_extensions.split(',') if custom_extensions else []
+                            )
+                            st.session_state.discovered_files = files
+                            st.session_state.current_url = url
+                            return files
 
-            else:
-                st.warning("No files found.")
-
-        with col2:
-            if st.button("Preview Page", use_container_width=True):
-                if url:
-                    async def preview():
-                        async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                            with st.spinner("Loading preview..."):
-                                return await dm.preview_page(url)
+                files = asyncio.run(run_deep_search())
+                if files:
+                    st.success(f"Found {len(files)} files!")
 
-        # File selection and download section
-        if st.session_state.discovered_files:
-            with st.expander("Download Options", expanded=True):
-                file_options = [f"{f['filename']} ({f['size']})" for f in st.session_state.discovered_files]
-                selected_indices = st.multiselect(
-                    "Select files to download",
-                    range(len(file_options)),
-                    format_func=lambda x: file_options[x]
-                )
-
-                if selected_indices:
-                    download_dir = st.text_input("Download Directory", value="./downloads")
-                    delete_after = st.checkbox("Delete after creating ZIP?")
-                    upload_drive = st.checkbox("Upload to Google Drive?")
+                    # Display files
+                    for file in files:
+                        st.write(f"- {file['filename']} ({file['size']})")
 
+                    # Download section
+                    selected_files = st.multiselect(
+                        "Select files to download",
+                        range(len(files)),
+                        format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
+                    )
+
+                    if selected_files:
+                        download_dir = st.text_input("Download Directory", value="./downloads")
+                        if st.button("Download Selected"):
+                            async def download_files():
+                                async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                                    paths = []
+                                    for idx in selected_files:
+                                        with st.spinner(f"Downloading {files[idx]['filename']}..."):
+                                            path = await dm.download_file(
+                                                files[idx],
+                                                download_dir,
+                                                url
+                                            )
+                                            if path:
+                                                paths.append(path)
+                                    return paths
 
-                                zf.write(p, arcname=os.path.basename(p))
-
-                        if upload_drive and st.session_state.google_creds:
-                            file_id = google_drive_upload(tmp.name, st.session_state.google_creds)
-                            if file_id and not isinstance(file_id, str):
-                                st.success(f"Uploaded to Google Drive! File ID: {file_id}")
-                            else:
-                                st.error("Failed to upload to Google Drive")
-
-                        if delete_after:
-                            for p in downloaded_paths:
-                                try:
-                                    os.remove(p)
-                                except:
-                                    pass
+                            downloaded = asyncio.run(download_files())
+                            if downloaded:
+                                st.success(f"Successfully downloaded {len(downloaded)} files to {download_dir}")
+                else:
+                    st.warning("No files found.")
 
     elif mode == "Bing Search":
         st.header("Bing Search Mode")
@@ -790,35 +674,52 @@ def main():
                 num_results=num_results
             ) as dm:
                 with st.spinner("Searching..."):
-                    if urls:
-                        st.success(f"Found {len(urls)} results!")
-                        for i, (url, info) in enumerate(zip(urls, info), 1):
-                            with st.expander(f"Result {i}: {url}", expanded=i==1):
-                                st.write(f"Snippet: {info['snippet']}")
-                                if info['entities']:
-                                    st.write("Entities:", ', '.join(f"{e[0]} ({e[1]})" for e in info['entities']))
-
-                                if st.button(f"Deep Search This Result {i}"):
-                                    st.session_state.current_url = url
-                                    async def search_result():
-                                        async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                                            return await dm.deep_search(
-                                                url=url,
-                                                custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
-                                                max_concurrency=max_concurrency
-                                            )
-
-                                    files = asyncio.run(search_result())
-                                    if files:
-                                        st.session_state.discovered_files = files
-                                        st.success(f"Found {len(files)} files!")
-                                    else:
-                                        st.warning("No files found.")
-                    else:
-                        st.warning("No results found.")
+                    urls = await dm.search_bing()
+                    if urls:
+                        st.success(f"Found {len(urls)} results!")
+                        for i, url in enumerate(urls, 1):
+                            with st.expander(f"Result {i}: {url}", expanded=i==1):
+                                if st.button(f"Deep Search This Result {i}"):
+                                    files = await dm.deep_search(
+                                        url=url,
+                                        custom_ext_list=custom_extensions.split(',') if custom_extensions else []
+                                    )
+                                    if files:
+                                        st.session_state.discovered_files = files
+                                        st.session_state.current_url = url
+                                        st.success(f"Found {len(files)} files!")
+
+                                        # Display and download section
+                                        for file in files:
+                                            st.write(f"- {file['filename']} ({file['size']})")
+
+                                        selected_files = st.multiselect(
+                                            "Select files to download",
+                                            range(len(files)),
+                                            format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
+                                        )
+
+                                        if selected_files:
+                                            download_dir = st.text_input("Download Directory", value="./downloads")
+                                            if st.button("Download Selected Files"):
+                                                paths = []
+                                                for idx in selected_files:
+                                                    with st.spinner(f"Downloading {files[idx]['filename']}..."):
+                                                        path = await dm.download_file(
+                                                            files[idx],
+                                                            download_dir,
+                                                            url
+                                                        )
+                                                        if path:
+                                                            paths.append(path)
+                                                if paths:
+                                                    st.success(f"Successfully downloaded {len(paths)} files to {download_dir}")
+                                    else:
+                                        st.warning("No files found on this page.")
+                    else:
+                        st.warning("No search results found.")
+
+        asyncio.run(run_search())
 
     else:  # PDF Summarizer mode
         st.header("PDF Summarizer")
@@ -826,9 +727,14 @@
 
     if st.button("Summarize"):
         if pdf_url:
+            with st.spinner("Generating summary..."):
+                summary = summarize_pdf_url(pdf_url)
+                st.write("Summary:")
+                st.write(summary)
 
 if __name__ == "__main__":
-    main()
+    try:
+        main()
+    except Exception as e:
+        st.error(f"An error occurred: {str(e)}")
+        logger.error(f"Application error: {str(e)}", exc_info=True)