Update app.py
app.py
CHANGED
@@ -32,6 +32,14 @@ import googleapiclient.discovery
 import google.auth.transport.requests
 import googleapiclient.http
 
+# New imports for RAG search
+import nltk
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import docx2txt
+import PyPDF2
+
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     level=logging.INFO,
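A note on what the new imports buy: TF-IDF vectors plus cosine similarity are the entire ranking mechanism behind the RAGSearch class added below (nltk is imported here but not referenced by any line in this diff). A minimal, self-contained illustration of that mechanism:

```python
# Illustration only (not part of the commit): the TF-IDF + cosine-similarity
# ranking that RAGSearch builds on.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["exam paper on calculus", "lecture notes on linear algebra"]
vectorizer = TfidfVectorizer(stop_words='english')
doc_vectors = vectorizer.fit_transform(docs)

query_vector = vectorizer.transform(["calculus exam"])
print(cosine_similarity(query_vector, doc_vectors).flatten())
# the first document scores higher, so it would rank first
```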
@@ -88,6 +96,77 @@ PROXY_ROTATION_CONFIG = {
     "proxies": [] # Will be populated from the UI if needed
 }
 
+# -------------------- RAG Search Class --------------------
+class RAGSearch:
+    def __init__(self):
+        self.file_texts = []
+        self.file_metadata = []
+        self.vectorizer = TfidfVectorizer(stop_words='english')
+        self.vectors = None
+
+    def add_file(self, file_data, file_info):
+        """Add a file to the search index"""
+        file_ext = os.path.splitext(file_info['filename'])[1]
+        text = self.extract_text(file_data, file_ext)
+        if text:
+            self.file_texts.append(text)
+            self.file_metadata.append(file_info)
+            return True
+        return False
+
+    def extract_text(self, file_data, file_ext):
+        """Extract text from different file types"""
+        try:
+            if file_ext.lower() == '.pdf':
+                reader = PyPDF2.PdfReader(BytesIO(file_data))
+                text = ""
+                for page in reader.pages:
+                    text += page.extract_text() + "\n"
+                return text
+            elif file_ext.lower() in ['.docx', '.doc']:
+                return docx2txt.process(BytesIO(file_data))
+            elif file_ext.lower() in ['.txt', '.csv', '.json']:
+                return file_data.decode('utf-8', errors='ignore')
+            else:
+                return ""
+        except Exception as e:
+            logger.error(f"Error extracting text: {e}")
+            return ""
+
+    def build_index(self):
+        """Build the search index"""
+        if not self.file_texts:
+            return False
+        try:
+            self.vectors = self.vectorizer.fit_transform(self.file_texts)
+            return True
+        except Exception as e:
+            logger.error(f"Error building search index: {e}")
+            return False
+
+    def search(self, query, top_k=5):
+        """Search the index for relevant files"""
+        if self.vectors is None:
+            return []
+
+        try:
+            query_vector = self.vectorizer.transform([query])
+            similarities = cosine_similarity(query_vector, self.vectors).flatten()
+            top_indices = similarities.argsort()[-top_k:][::-1]
+
+            results = []
+            for i, idx in enumerate(top_indices):
+                if similarities[idx] > 0:
+                    results.append({
+                        'file_info': self.file_metadata[idx],
+                        'score': float(similarities[idx]),
+                        'rank': i+1
+                    })
+            return results
+        except Exception as e:
+            logger.error(f"Error during search: {e}")
+            return []
+
 # -------------------- Utility Functions --------------------
 def get_random_user_agent():
     return random.choice(USER_AGENTS)
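For readers skimming the diff, a hypothetical standalone use of the class above, outside Streamlit. It assumes `from io import BytesIO` and a module-level `logger` are defined earlier in app.py (this diff does not show them), and the file names are invented:

```python
# Hypothetical usage of the RAGSearch class above; file names are examples.
import os

rag = RAGSearch()
for name in ["syllabus.pdf", "notes.txt"]:
    with open(name, "rb") as f:
        data = f.read()
    # file_info mirrors the dicts built by extract_downloadable_files
    rag.add_file(data, {"filename": name, "size": f"{os.path.getsize(name)} bytes"})

if rag.build_index():
    for hit in rag.search("integration by parts", top_k=3):
        print(hit["rank"], hit["file_info"]["filename"], round(hit["score"], 2))
```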
@@ -857,6 +936,118 @@ class DownloadManager:
             logger.error(f"Error getting exam links: {e}")
             return []
 
+    async def discover_hidden_links(self, page):
+        """Discover hidden links that might be in JavaScript, iframes, or dynamic content"""
+        hidden_links = set()
+
+        # Execute JavaScript to find links in script tags and data attributes
+        js_links = await page.evaluate("""
+            () => {
+                const links = new Set();
+
+                // Extract URLs from script tags
+                const scripts = document.querySelectorAll('script');
+                for (const script of scripts) {
+                    const content = script.textContent || '';
+                    const urlMatches = content.match(/["'](https?:\/\/[^"']+)["']/g) || [];
+                    for (let match of urlMatches) {
+                        links.add(match.replace(/["']/g, ''));
+                    }
+                }
+
+                // Check for links in data attributes
+                const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link]');
+                for (const el of elements) {
+                    for (const attr of ['data-url', 'data-href', 'data-src', 'data-link']) {
+                        const val = el.getAttribute(attr);
+                        if (val && val.match(/^https?:\/\//)) {
+                            links.add(val);
+                        }
+                    }
+                }
+
+                // Look for URLs in inline event handlers
+                const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup]');
+                for (const el of clickableElements) {
+                    for (const attr of ['onclick', 'onmousedown', 'onmouseup']) {
+                        const val = el.getAttribute(attr);
+                        if (val) {
+                            const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || [];
+                            for (let match of urlMatches) {
+                                links.add(match.replace(/["']/g, ''));
+                            }
+                        }
+                    }
+                }
+
+                return Array.from(links);
+            }
+        """)
+
+        for link in js_links:
+            hidden_links.add(link)
+
+        # Extract links from iframes
+        iframes = await page.query_selector_all('iframe')
+        for iframe in iframes:
+            try:
+                frame = await iframe.content_frame()
+                if frame:
+                    iframe_links = await frame.evaluate("""
+                        () => {
+                            return Array.from(document.querySelectorAll('a[href]'))
+                                .map(a => a.href)
+                                .filter(href => href.startsWith('http'));
+                        }
+                    """)
+                    for link in iframe_links:
+                        hidden_links.add(link)
+            except Exception as e:
+                logger.warning(f"Could not extract links from iframe: {e}")
+
+        # Look for links in shadow DOM (used in modern web components)
+        shadow_links = await page.evaluate("""
+            () => {
+                const links = new Set();
+
+                // Helper function to recursively process shadow roots
+                function processShadowRoot(root) {
+                    if (!root) return;
+
+                    // Get links in this shadow root
+                    const shadowLinks = root.querySelectorAll('a[href]');
+                    for (const link of shadowLinks) {
+                        if (link.href && link.href.startsWith('http')) {
+                            links.add(link.href);
+                        }
+                    }
+
+                    // Process nested shadow roots
+                    const elements = root.querySelectorAll('*');
+                    for (const el of elements) {
+                        if (el.shadowRoot) {
+                            processShadowRoot(el.shadowRoot);
+                        }
+                    }
+                }
+
+                // Find all shadow roots in the document
+                const elements = document.querySelectorAll('*');
+                for (const el of elements) {
+                    if (el.shadowRoot) {
+                        processShadowRoot(el.shadowRoot);
+                    }
+                }
+
+                return Array.from(links);
+            }
+        """)
+
+        for link in shadow_links:
+            hidden_links.add(link)
+
+        return hidden_links
+
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
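All three `page.evaluate` scripts above pull quoted absolute URLs out of page text with the same regular expression. The same match, redone in Python purely as an illustration (in the commit it runs inside the browser):

```python
# What the script-tag scan matches, expressed in Python (illustration only).
import re

script_text = """fetch("https://example.com/files/a.pdf");
var mirror = 'https://cdn.example.com/a.pdf';"""
urls = re.findall(r'["\'](https?://[^"\']+)["\']', script_text)
print(urls)  # ['https://example.com/files/a.pdf', 'https://cdn.example.com/a.pdf']
```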
@@ -1176,6 +1367,17 @@ class DownloadManager:
                         'metadata': {}
                     })
 
+            # Check for hidden links that might be in JavaScript, iframes, or dynamic content
+            hidden_links = await self.discover_hidden_links(self.page)
+            for link in hidden_links:
+                if any(link.lower().endswith(ext) for ext in all_exts):
+                    found_files.append({
+                        'url': link,
+                        'filename': os.path.basename(link.split('?')[0]),
+                        'size': await self.get_file_size(link),
+                        'metadata': {}
+                    })
+
             # Deduplicate files by URL
             seen_urls = set()
             unique_files = []
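One detail of the filter just added: it applies `endswith` to the raw URL, so a wanted extension followed by a query string does not match, even though the filename is later derived with the query stripped. A small illustration:

```python
# Illustration of the endswith filter above (not part of the commit).
all_exts = ['.pdf', '.docx']
links = [
    'https://host.example/a.pdf',        # matches
    'https://host.example/a.pdf?dl=1',   # does not match: query string
    'https://host.example/index.html',   # does not match: wrong extension
]
matches = [l for l in links if any(l.lower().endswith(ext) for ext in all_exts)]
print(matches)  # ['https://host.example/a.pdf']
```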
@@ -2725,6 +2927,9 @@ def main():
         st.session_state.do_deep_search = False
         st.session_state.deep_search_url = None
         st.session_state.search_results = []
+        # For RAG search
+        st.session_state.rag_indexed = False
+        st.session_state.rag_engine = None
 
     with st.sidebar:
         mode = st.radio("Select Mode", ["Manual URL", "Bing Search"], key="mode_select")
@@ -2814,6 +3019,99 @@
 
         if st.session_state.discovered_files:
             files = st.session_state.discovered_files
+
+            # Display files with direct download buttons
+            download_dir = "./downloads"
+            os.makedirs(download_dir, exist_ok=True)
+
+            # Add RAG Search interface
+            st.markdown("### Search Within Discovered Files")
+            search_query = st.text_input("Enter search terms", key="rag_search_query")
+
+            if st.button("Search Files", key="rag_search_btn") and search_query:
+                # Initialize RAG search engine
+                if not st.session_state.rag_indexed:
+                    rag_search = RAGSearch()
+
+                    with st.spinner("Indexing files for search..."):
+                        # First download files to extract text
+                        temp_dir = "./temp_downloads"
+                        os.makedirs(temp_dir, exist_ok=True)
+
+                        async def download_for_indexing():
+                            downloaded = 0
+                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                for i, file_info in enumerate(files):
+                                    # Only process common text-based file formats
+                                    ext = os.path.splitext(file_info['filename'])[1].lower()
+                                    if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json']:
+                                        path = await dm.download_file(file_info, temp_dir, url)
+                                        if path:
+                                            with open(path, 'rb') as f:
+                                                file_data = f.read()
+
+                                            # Add to search index
+                                            if rag_search.add_file(file_data, file_info):
+                                                downloaded += 1
+
+                                            # Clean up
+                                            os.remove(path)
+                            return downloaded
+
+                        indexed_count = asyncio.run(download_for_indexing())
+                        if indexed_count > 0:
+                            rag_search.build_index()
+                            st.session_state.rag_engine = rag_search
+                            st.session_state.rag_indexed = True
+                            st.success(f"Indexed {indexed_count} files for search")
+                        else:
+                            st.warning("Could not index any files. Try with more text-based documents.")
+
+                # Perform the search
+                if st.session_state.rag_indexed:
+                    search_results = st.session_state.rag_engine.search(search_query)
+
+                    if search_results:
+                        st.write(f"Found {len(search_results)} relevant files:")
+
+                        for result in search_results:
+                            file_info = result['file_info']
+                            score = result['score']
+                            with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
+                                st.write(f"Size: {file_info['size']}")
+                                if 'metadata' in file_info and file_info['metadata']:
+                                    st.write("Metadata:")
+                                    for k, v in file_info['metadata'].items():
+                                        if k != 'file_id':  # Skip technical details
+                                            st.write(f"- {k}: {v}")
+
+                                # Add direct download button
+                                if st.button(f"Download this file", key=f"rag_dl_{result['rank']}"):
+                                    with st.spinner(f"Downloading {file_info['filename']}..."):
+                                        async def download_search_result():
+                                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                                path = await dm.download_file(file_info, download_dir, url)
+                                                return path
+
+                                        path = asyncio.run(download_search_result())
+                                        if path:
+                                            with open(path, "rb") as f:
+                                                file_data = f.read()
+
+                                            st.download_button(
+                                                label=f"Save {file_info['filename']}",
+                                                data=file_data,
+                                                file_name=file_info['filename'],
+                                                mime=mimetypes.guess_type(path)[0] or "application/octet-stream",
+                                                key=f"save_rag_{result['rank']}"
+                                            )
+                    else:
+                        st.warning("No matching files found for your query.")
+
+            # Show all files with direct download options
+            st.markdown("### All Discovered Files")
+
+            # Batch download options
             col1, col2 = st.columns([1, 4])
             with col1:
                 if st.button("Select All", key="select_all_btn"):
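Condensed and taken out of Streamlit, the indexing flow wired into the UI above looks roughly like the sketch below. The DownloadManager constructor arguments and the file-dict shape follow app.py, but treat this as an approximation, not part of the commit:

```python
# Sketch of the index-then-search flow (constructor args simplified).
import asyncio, os

async def build_rag_index(files, url, temp_dir="./temp_downloads"):
    os.makedirs(temp_dir, exist_ok=True)
    rag = RAGSearch()
    async with DownloadManager(use_proxy=False, proxy=None, use_stealth=False) as dm:
        for info in files:
            ext = os.path.splitext(info['filename'])[1].lower()
            if ext in ('.pdf', '.doc', '.docx', '.txt', '.csv', '.json'):
                path = await dm.download_file(info, temp_dir, url)
                if path:
                    with open(path, 'rb') as f:
                        rag.add_file(f.read(), info)
                    os.remove(path)
    return rag if rag.build_index() else None

# usage: rag = asyncio.run(build_rag_index(files, url)); rag.search("my query")
```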
@@ -2821,43 +3119,19 @@
                 if st.button("Clear Selection", key="clear_selection_btn"):
                     st.session_state.selected_files = []
 
-            #
-            file_options = []
-            for i, file in enumerate(files):
-                filename = file['filename']
-                size = file['size']
-                meta = file.get('metadata', {})
-
-                # Format display string with relevant metadata
-                if meta and 'Pages' in meta:
-                    file_info = f"{filename} ({size}) - {meta.get('Pages', '')} pages"
-                else:
-                    file_info = f"{filename} ({size})"
-
-                file_options.append((i, file_info))
-
-            selected_indices = st.multiselect(
-                "Select files to download",
-                options=[i for i, _ in file_options],
-                default=st.session_state.selected_files,
-                format_func=lambda i: next(info for idx, info in file_options if idx == i),
-                key="file_multiselect"
-            )
-
-            st.session_state.selected_files = selected_indices
-
-            if selected_indices:
-                batch_col1, batch_col2, batch_col3, batch_col4 = st.columns(4)
-                with col1:
+            # Batch download settings
+            if 'selected_files' in st.session_state and st.session_state.selected_files:
+                batch_col1, batch_col2, batch_col3, batch_col4 = st.columns(4)
+                with batch_col1:
                     download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
-                with col2:
+                with batch_col2:
                     create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
-                with col3:
+                with batch_col3:
                     delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
-                with col4:
+                with batch_col4:
                     upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
 
-                if st.button("Download Selected", key="
+                if st.button("Download Selected", key="download_batch_btn"):
                     if not os.path.exists(download_dir):
                         os.makedirs(download_dir)
 
@@ -2871,10 +3145,10 @@
                         proxy=proxy,
                         use_stealth=use_stealth
                     ) as dm:
-                        for i, idx in enumerate(selected_indices):
-                            progress = (i + 1) / len(selected_indices)
+                        for i, idx in enumerate(st.session_state.selected_files):
+                            progress = (i + 1) / len(st.session_state.selected_files)
                             file_info = files[idx]
-                            status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_indices)})")
+                            status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(st.session_state.selected_files)})")
                             progress_bar.progress(progress)
 
                             path = await dm.download_file(file_info, download_dir, url)
@@ -2925,23 +3199,49 @@
                             except Exception as e:
                                 st.warning(f"Could not delete {path}: {e}")
                         st.info("Deleted original files after ZIP creation")
-
-
-
-
-
+
+            # Individual file display with direct download buttons
+            for i, file in enumerate(files):
+                col1, col2, col3 = st.columns([3, 1, 1])
+                with col1:
+                    filename = file['filename']
+                    size = file['size']
+                    meta = file.get('metadata', {})
+                    file_info = f"{filename} ({size})"
+                    if meta and 'Pages' in meta:
+                        file_info += f" - {meta.get('Pages', '')} pages"
+                    st.markdown(f"**{i+1}. {file_info}**")
+
+                with col2:
+                    # Add direct download button for each file
+                    if st.button(f"Download", key=f"direct_dl_{i}"):
+                        with st.spinner(f"Downloading {filename}..."):
+                            async def download_single_file():
+                                async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                    path = await dm.download_file(file, download_dir, url)
+                                    return path
+
+                            downloaded_path = asyncio.run(download_single_file())
+                            if downloaded_path:
+                                with open(downloaded_path, "rb") as f:
                                     file_data = f.read()
 
-                                file_name = os.path.basename(path)
-                                mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream"
-
                                 st.download_button(
-                                    label=f"
+                                    label=f"Save {filename}",
                                     data=file_data,
-                                    file_name=file_name,
-                                    mime=mime_type,
-                                    key=f"
+                                    file_name=filename,
+                                    mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream",
+                                    key=f"save_file_{i}"
                                 )
+
+                with col3:
+                    # Add to selection for batch download
+                    if i in st.session_state.selected_files:
+                        if st.button("Unselect", key=f"unselect_{i}"):
+                            st.session_state.selected_files.remove(i)
+                    else:
+                        if st.button("Select", key=f"select_{i}"):
+                            st.session_state.selected_files.append(i)
 
     elif mode == "Bing Search":
         st.header("Bing Search Mode")
@@ -3007,6 +3307,52 @@
                 st.session_state.discovered_files = files
                 st.session_state.current_url = url
                 st.success(f"Found {len(files)} files!")
+
+                # Show files with direct download options
+                download_dir = "./downloads"
+                os.makedirs(download_dir, exist_ok=True)
+
+                for i, file in enumerate(files):
+                    col1, col2, col3 = st.columns([3, 1, 1])
+                    with col1:
+                        filename = file['filename']
+                        size = file['size']
+                        meta = file.get('metadata', {})
+                        file_info = f"{filename} ({size})"
+                        if meta and 'Pages' in meta:
+                            file_info += f" - {meta.get('Pages', '')} pages"
+                        st.markdown(f"**{i+1}. {file_info}**")
+
+                    with col2:
+                        # Add direct download button for each file
+                        if st.button(f"Download", key=f"direct_dl_{i}"):
+                            with st.spinner(f"Downloading {filename}..."):
+                                async def download_single_file():
+                                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                        path = await dm.download_file(file, download_dir, url)
+                                        return path
+
+                                downloaded_path = asyncio.run(download_single_file())
+                                if downloaded_path:
+                                    with open(downloaded_path, "rb") as f:
+                                        file_data = f.read()
+
+                                    st.download_button(
+                                        label=f"Save {filename}",
+                                        data=file_data,
+                                        file_name=filename,
+                                        mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream",
+                                        key=f"save_file_{i}"
+                                    )
+
+                    with col3:
+                        # Add to selection for batch download
+                        if i in st.session_state.selected_files:
+                            if st.button("Unselect", key=f"unselect_{i}"):
+                                st.session_state.selected_files.remove(i)
+                        else:
+                            if st.button("Select", key=f"select_{i}"):
+                                st.session_state.selected_files.append(i)
             else:
                 st.warning("No files found.")
 
@@ -3054,7 +3400,7 @@
 
     # Add footer with attribution
     st.markdown('---')
-    st.markdown('Created by [Euler314](https://github.com/
+    st.markdown('Created by [Euler314](https://github.com/yu314coder)')
 
 if __name__ == "__main__":
     main()