Update app.py
app.py
CHANGED
@@ -193,10 +193,371 @@ def nlp_extract_entities(text: str):
 # ---------- AI-enhanced Query Preprocessing -------------
 def ai_preprocess_query(query: str) -> str:
     return query
-
-
-
-
+class DownloadManager:
+    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
+        self.use_proxy = use_proxy
+        self.proxy = proxy
+        self.query = query
+        self.num_results = num_results
+        self.playwright = None
+        self.browser = None
+        self.context = None
+        self.page = None
+
+    async def __aenter__(self):
+        self.playwright = await async_playwright().start()
+        opts = {"headless": True}
+        if self.use_proxy and self.proxy:
+            opts["proxy"] = {"server": self.proxy}
+        self.browser = await self.playwright.chromium.launch(**opts)
+        self.context = await self.browser.new_context(user_agent=get_random_user_agent())
+        self.page = await self.context.new_page()
+        await self.page.set_extra_http_headers({
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Referer': 'https://www.bing.com/'
+        })
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if self.browser:
+            await self.browser.close()
+        if self.playwright:
+            await self.playwright.stop()
+
+    async def get_file_size(self, url):
+        try:
+            page = await self.context.new_page()
+            try:
+                response = await page.request.head(url, timeout=15000)
+                # Playwright lower-cases header names
+                length = response.headers.get('content-length')
+                if length:
+                    return sizeof_fmt(int(length))
+                return "Unknown Size"
+            finally:
+                await page.close()
+        except Exception:
+            return "Unknown Size"
+
+    async def get_pdf_metadata(self, url):
+        try:
+            page = await self.context.new_page()
+            try:
+                resp = await page.request.get(url, timeout=15000)
+                if not resp.ok:
+                    return {}
+                content = await resp.body()
+                reader = PdfReader(BytesIO(content))
+                return {
+                    'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
+                    'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
+                    'Pages': len(reader.pages),
+                }
+            finally:
+                await page.close()
+        except Exception:
+            return {}
+
+    async def extract_real_download_url(self, url):
+        try:
+            page = await self.context.new_page()
+            try:
+                response = await page.goto(url, wait_until='networkidle', timeout=30000)
+                if not response:
+                    return url
+
+                # Check if the response is a redirect
+                if response.headers.get('location'):
+                    return response.headers['location']
+
+                # Check if the response is already a file
+                content_type = response.headers.get('content-type', '')
+                if 'text/html' not in content_type.lower():
+                    return url
+
+                # Look for a meta refresh redirect
+                content = await page.content()
+                soup = BeautifulSoup(content, 'html.parser')
+                meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
+                if meta_refresh:
+                    refresh_value = meta_refresh.get('content', '')
+                    if 'url=' in refresh_value.lower():
+                        return refresh_value.split('url=')[-1].strip()
+
+                return page.url
+            finally:
+                await page.close()
+        except Exception as e:
+            logger.error(f"Error extracting real download URL: {e}")
+            return url
+
+    async def extract_downloadable_files(self, url, custom_ext_list):
+        found_files = []
+        try:
+            # First try to load the page
+            response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
+            if not response:
+                return []
+
+            final_url = self.page.url
+
+            # Handle redirects and download scripts
+            if '.php' in final_url or 'download' in final_url or 'get' in final_url:
+                real_url = await self.extract_real_download_url(final_url)
+                if real_url != final_url:
+                    content_type = (await self.page.request.head(real_url)).headers.get('content-type', '')
+                    if content_type and 'text/html' not in content_type.lower():
+                        found_files.append({
+                            'url': real_url,
+                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                            'size': await self.get_file_size(real_url),
+                            'metadata': {}
+                        })
+                        return found_files
+
+            await self.page.wait_for_load_state('networkidle', timeout=30000)
+            await human_like_interactions(self.page)
+
+            content = await self.page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            # Define extensions to look for
+            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', '.avi', '.mkv',
+                            '.png', '.jpg', '.jpeg', '.gif', '.xlsx', '.xls', '.ppt', '.pptx', '.txt']
+            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
+
+            # Parse base URL for relative links
+            parsed_base = urlparse(final_url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+            # Find all links
+            for a in soup.find_all('a', href=True):
+                href = a['href'].strip()
+
+                # Skip empty or javascript links
+                if not href or href.startswith('javascript:') or href == '#':
+                    continue
+
+                # Handle special cases (PHP scripts, download handlers)
+                if '.php' in href.lower() or 'download' in href.lower() or 'get' in href.lower():
+                    full_url = href if href.startswith('http') else urljoin(base_url, href)
+                    real_url = await self.extract_real_download_url(full_url)
+                    if real_url and real_url != full_url:
+                        size_str = await self.get_file_size(real_url)
+                        found_files.append({
+                            'url': real_url,
+                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                            'size': size_str,
+                            'metadata': {}
+                        })
+                    continue
+
+                # Handle direct file links
+                if any(href.lower().endswith(ext) for ext in all_exts):
+                    file_url = href if href.startswith('http') else urljoin(base_url, href)
+                    size_str = await self.get_file_size(file_url)
+                    meta = {}
+                    if file_url.lower().endswith('.pdf'):
+                        meta = await self.get_pdf_metadata(file_url)
+                    found_files.append({
+                        'url': file_url,
+                        'filename': os.path.basename(urlparse(file_url).path),
+                        'size': size_str,
+                        'metadata': meta
+                    })
+
+                # Handle Google Drive links
+                elif any(x in href for x in ['drive.google.com', 'docs.google.com']):
+                    file_id = None
+                    for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
+                        match = re.search(pattern, href)
+                        if match:
+                            file_id = match.group(1)
+                            break
+
+                    if file_id:
+                        direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+                        page = await self.context.new_page()
+                        try:
+                            response = await page.request.head(direct_url, timeout=15000)
+                            filename = file_id
+                            content_disposition = response.headers.get('content-disposition', '')
+                            if content_disposition:
+                                filename_match = re.findall('filename="(.+?)"', content_disposition)
+                                if filename_match:
+                                    filename = filename_match[0]
+
+                            found_files.append({
+                                'url': direct_url,
+                                'filename': filename,
+                                'size': await self.get_file_size(direct_url),
+                                'metadata': {}
+                            })
+                        except Exception as e:
+                            logger.error(f"Error processing Google Drive link: {e}")
+                        finally:
+                            await page.close()
+
+            # Make list unique based on URLs
+            seen_urls = set()
+            unique_files = []
+            for f in found_files:
+                if f['url'] not in seen_urls:
+                    seen_urls.add(f['url'])
+                    unique_files.append(f)
+
+            return unique_files
+
+        except Exception as e:
+            logger.error(f"Error extracting files from {url}: {e}")
+            return []
+
+    async def download_file(self, file_info, save_dir, referer):
+        file_url = file_info['url']
+        fname = file_info['filename']
+        path = os.path.join(save_dir, fname)
+
+        # Handle duplicate filenames
+        base, ext = os.path.splitext(fname)
+        counter = 1
+        while os.path.exists(path):
+            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
+            counter += 1
+
+        os.makedirs(save_dir, exist_ok=True)
+
+        try:
+            # Special handling for Google Drive
+            if 'drive.google.com' in file_url:
+                import gdown
+                try:
+                    st.write(f"Downloading from Google Drive: {fname}")
+                    output = gdown.download(file_url, path, quiet=False)
+                    if output:
+                        return path
+                    return None
+                except Exception as e:
+                    logger.error(f"Google Drive download error: {e}")
+                    return None
+
+            # Handle normal downloads
+            page = await self.context.new_page()
+            try:
+                st.write(f"Downloading: {fname}")
+
+                headers = {
+                    'Accept': '*/*',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Referer': referer
+                }
+
+                response = await page.request.get(file_url, headers=headers, timeout=30000)
+
+                if response.status == 200:
+                    content = await response.body()
+                    with open(path, 'wb') as f:
+                        f.write(content)
+                    return path
+                else:
+                    logger.error(f"Download failed with status {response.status}: {file_url}")
+                    return None
+            finally:
+                await page.close()
+
+        except Exception as e:
+            logger.error(f"Error downloading {file_url}: {e}")
+            return None
+
+    async def search_bing(self):
+        if not self.query:
+            return []
+
+        search_query = self.query
+        if "filetype:pdf" not in search_query.lower():
+            search_query += " filetype:pdf"
+
+        search_url = f"https://www.bing.com/search?q={search_query}&count={self.num_results}"
+
+        try:
+            await self.page.goto(search_url, timeout=30000)
+            await self.page.wait_for_selector('li.b_algo', timeout=30000)
+            await human_like_scroll(self.page)
+
+            results = []
+            elements = await self.page.query_selector_all('li.b_algo')
+            for element in elements:
+                link = await element.query_selector('h2 a')
+                if link:
+                    url = await link.get_attribute('href')
+                    if url:
+                        results.append(url)
+
+            return results[:self.num_results]
+
+        except Exception as e:
+            logger.error(f"Bing search error: {e}")
+            return []
+
+    async def get_sublinks(self, url, limit=100):
+        try:
+            await self.page.goto(url, timeout=30000)
+            content = await self.page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            parsed_base = urlparse(url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+            links = set()
+            for a in soup.find_all('a', href=True):
+                href = a['href'].strip()
+                if href.startswith('http'):
+                    links.add(href)
+                elif href.startswith('/'):
+                    links.add(f"{base_url}{href}")
+
+            return list(links)[:limit]
+
+        except Exception as e:
+            logger.error(f"Error getting sublinks: {e}")
+            return []
+
+    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100):
+        if not custom_ext_list:
+            custom_ext_list = []
+
+        progress_text = st.empty()
+        progress_bar = st.progress(0)
+
+        try:
+            # Search main page
+            progress_text.text("Analyzing main page...")
+            main_files = await self.extract_downloadable_files(url, custom_ext_list)
+
+            # Get and search sublinks
+            progress_text.text("Getting sublinks...")
+            sublinks = await self.get_sublinks(url, sublink_limit)
+
+            if not sublinks:
+                progress_bar.progress(1.0)
+                return main_files
+
+            # Process sublinks (copy so main_files is not mutated)
+            all_files = list(main_files)
+            total_links = len(sublinks)
+
+            for i, sublink in enumerate(sublinks, 1):
+                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
+                progress_bar.progress(i / total_links)
+
+                sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+                all_files.extend(sub_files)
+
+            # Make results unique
+            seen_urls = set()
+            unique_files = []
+            for f in all_files:
+                if f['url'] not in seen_urls:
+                    seen_urls.add(f['url'])
+                    unique_files.append(f)
+
+            progress_text.text(f"Found {len(unique_files)} unique files")
+            progress_bar.progress(1.0)
+
+            return unique_files
+
+        except Exception as e:
+            logger.error(f"Deep search error: {e}")
+            return []
 
 # ---------- Main Streamlit UI Implementation -------------
 def main():
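For reference, a minimal usage sketch of the DownloadManager added above, assuming the helpers it references (get_random_user_agent, human_like_scroll, human_like_interactions, sizeof_fmt, logger) are defined earlier in app.py. The run_deep_search wrapper below is hypothetical and not part of this commit.

import asyncio

# Hypothetical driver (not part of this commit): deep-search one URL and
# download every file it finds into ./downloads.
async def run_deep_search(url: str, custom_exts=None):
    async with DownloadManager(use_proxy=False, num_results=5) as dm:
        files = await dm.deep_search(url, custom_ext_list=custom_exts or [])
        saved = []
        for info in files:
            path = await dm.download_file(info, save_dir="downloads", referer=url)
            if path:
                saved.append(path)
        return files, saved

# Streamlit callbacks are synchronous, so the coroutine would typically be run
# with asyncio.run, assuming no event loop is already active in the session:
# found, downloaded = asyncio.run(run_deep_search("https://example.com", [".pdf"]))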