Spaces:

euler314
/

craw_web

Running

App Files Community

euler314 committed on Feb 15

Commit

2ee6b58

verified ·

1 Parent(s): 6b8a747

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -47

app.py CHANGED Viewed

@@ -54,29 +54,56 @@ def load_models():
             st.info("Downloading spaCy model...")
             spacy.cli.download("en_core_web_sm")
             nlp = spacy.load("en_core_web_sm")
-        # Load other models
-        from sentence_transformers import SentenceTransformer
-        semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
-        from transformers import pipeline
-        summarizer = pipeline("summarization")
         return nlp, semantic_model, summarizer
     except Exception as e:
         st.error(f"Error loading models: {e}")
         return None, None, None
-# Initialize dependencies and models
-with st.spinner("Setting up dependencies..."):
-    install_playwright_dependencies()
 with st.spinner("Loading models..."):
     nlp_model, semantic_model, summarizer = load_models()
-if not all([nlp_model, semantic_model, summarizer]):
-    st.error("Failed to load required models. Please check the error messages above.")
     st.stop()
 # Rest of your imports and code here...
@@ -625,11 +652,16 @@ def main():
                 if files:
                     st.success(f"Found {len(files)} files!")
-                    # Display files
-                    for file in files:
-                        st.write(f"- {file['filename']} ({file['size']})")
                     # Download section
                     selected_files = st.multiselect(
                         "Select files to download",
                         range(len(files)),
@@ -637,13 +669,22 @@ def main():
                     )
                     if selected_files:
-                        download_dir = st.text_input("Download Directory", value="./downloads")
-                        if st.button("Download Selected"):
-                            async def download_files():
-                                async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                                    paths = []
-                                    for idx in selected_files:
-                                        with st.spinner(f"Downloading {files[idx]['filename']}..."):
                                             path = await dm.download_file(
                                                 files[idx],
                                                 download_dir,
@@ -651,11 +692,21 @@ def main():
                                             )
                                             if path:
                                                 paths.append(path)
-                                    return paths
-                            downloaded = asyncio.run(download_files())
-                            if downloaded:
-                                st.success(f"Successfully downloaded {len(downloaded)} files to {download_dir}")
                 else:
                     st.warning("No files found.")
@@ -689,9 +740,13 @@ def main():
                                                 st.session_state.current_url = url
                                                 st.success(f"Found {len(files)} files!")
-                                                # Display and download section
-                                                for file in files:
-                                                    st.write(f"- {file['filename']} ({file['size']})")
                                                 selected_files = st.multiselect(
                                                     "Select files to download",
@@ -700,11 +755,20 @@ def main():
                                                 )
                                                 if selected_files:
-                                                    download_dir = st.text_input("Download Directory", value="./downloads")
-                                                    if st.button("Download Selected Files"):
-                                                        paths = []
-                                                        for idx in selected_files:
-                                                            with st.spinner(f"Downloading {files[idx]['filename']}..."):
                                                                 path = await dm.download_file(
                                                                     files[idx],
                                                                     download_dir,
@@ -712,8 +776,18 @@ def main():
                                                                 )
                                                                 if path:
                                                                     paths.append(path)
-                                                        if paths:
-                                                            st.success(f"Successfully downloaded {len(paths)} files to {download_dir}")
                                             else:
                                                 st.warning("No files found on this page.")
                             else:
@@ -722,15 +796,18 @@ def main():
                 asyncio.run(run_search())
     else:  # PDF Summarizer mode
-        st.header("PDF Summarizer")
-        pdf_url = st.text_input("Enter PDF URL")
-        if st.button("Summarize"):
-            if pdf_url:
-                with st.spinner("Generating summary..."):
-                    summary = summarize_pdf_url(pdf_url)
-                    st.write("Summary:")
-                    st.write(summary)
 if __name__ == "__main__":
     try:

             st.info("Downloading spaCy model...")
             spacy.cli.download("en_core_web_sm")
             nlp = spacy.load("en_core_web_sm")
+        # Load SentenceTransformer with offline handling
+        try:
+            from sentence_transformers import SentenceTransformer
+            model_name = 'all-MiniLM-L6-v2'
+            cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
+            if os.path.exists(os.path.join(cache_dir, model_name)):
+                semantic_model = SentenceTransformer(os.path.join(cache_dir, model_name))
+            else:
+                st.warning(f"Downloading SentenceTransformer model {model_name}...")
+                semantic_model = SentenceTransformer(model_name)
+        except Exception as e:
+            st.error(f"Error loading SentenceTransformer: {e}")
+            st.info("Continuing without semantic search capability...")
+            semantic_model = None
+        # Load Transformers pipeline with offline handling
+        try:
+            from transformers import pipeline, AutoModelForSeq2SeqGeneration, AutoTokenizer
+            model_name = "facebook/bart-large-cnn"
+            cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
+            if os.path.exists(os.path.join(cache_dir, model_name)):
+                summarizer = pipeline("summarization", model=model_name)
+            else:
+                st.warning(f"Downloading Transformer model {model_name}...")
+                summarizer = pipeline("summarization")
+        except Exception as e:
+            st.error(f"Error loading Transformers: {e}")
+            st.info("Continuing without summarization capability...")
+            summarizer = None
         return nlp, semantic_model, summarizer
     except Exception as e:
         st.error(f"Error loading models: {e}")
         return None, None, None
+# Initialize models with better error handling
 with st.spinner("Loading models..."):
     nlp_model, semantic_model, summarizer = load_models()
+if nlp_model is None:
+    st.error("Failed to load essential NLP model. The application cannot continue.")
     st.stop()
+else:
+    # Continue with available features based on which models loaded successfully
+    if semantic_model is None:
+        st.warning("Semantic search features will be disabled.")
+    if summarizer is None:
+        st.warning("PDF summarization features will be disabled.")
 # Rest of your imports and code here...
                 if files:
                     st.success(f"Found {len(files)} files!")
+                    with st.expander("Found Files", expanded=True):
+                        for i, file in enumerate(files):
+                            col1, col2 = st.columns([3, 1])
+                            with col1:
+                                st.write(f"{i+1}. {file['filename']}")
+                            with col2:
+                                st.write(f"Size: {file['size']}")
                     # Download section
+                    st.subheader("Download Files")
                     selected_files = st.multiselect(
                         "Select files to download",
                         range(len(files)),
                     )
                     if selected_files:
+                        col1, col2 = st.columns([3, 1])
+                        with col1:
+                            download_dir = st.text_input("Download Directory", value="./downloads")
+                        with col2:
+                            if st.button("Download Selected", use_container_width=True):
+                                async def download_files():
+                                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                                        paths = []
+                                        progress_text = st.empty()
+                                        progress_bar = st.progress(0)
+                                        for i, idx in enumerate(selected_files):
+                                            progress = (i + 1) / len(selected_files)
+                                            progress_text.text(f"Downloading {files[idx]['filename']}...")
+                                            progress_bar.progress(progress)
                                             path = await dm.download_file(
                                                 files[idx],
                                                 download_dir,
                                             )
                                             if path:
                                                 paths.append(path)
+                                        progress_text.empty()
+                                        progress_bar.empty()
+                                        return paths
+                                downloaded = asyncio.run(download_files())
+                                if downloaded:
+                                    st.success(f"Successfully downloaded {len(downloaded)} files to {download_dir}")
+                                    # Create zip file if multiple files were downloaded
+                                    if len(downloaded) > 1:
+                                        zip_path = os.path.join(download_dir, "downloads.zip")
+                                        with zipfile.ZipFile(zip_path, 'w') as zipf:
+                                            for file in downloaded:
+                                                zipf.write(file, os.path.basename(file))
+                                        st.success(f"Created zip file: {zip_path}")
                 else:
                     st.warning("No files found.")
                                                 st.session_state.current_url = url
                                                 st.success(f"Found {len(files)} files!")
+                                                with st.expander("Found Files", expanded=True):
+                                                    for j, file in enumerate(files):
+                                                        col1, col2 = st.columns([3, 1])
+                                                        with col1:
+                                                            st.write(f"{j+1}. {file['filename']}")
+                                                        with col2:
+                                                            st.write(f"Size: {file['size']}")
                                                 selected_files = st.multiselect(
                                                     "Select files to download",
                                                 )
                                                 if selected_files:
+                                                    col1, col2 = st.columns([3, 1])
+                                                    with col1:
+                                                        download_dir = st.text_input("Download Directory", value="./downloads")
+                                                    with col2:
+                                                        if st.button("Download Selected Files"):
+                                                            progress_text = st.empty()
+                                                            progress_bar = st.progress(0)
+                                                            paths = []
+                                                            for k, idx in enumerate(selected_files):
+                                                                progress = (k + 1) / len(selected_files)
+                                                                progress_text.text(f"Downloading {files[idx]['filename']}...")
+                                                                progress_bar.progress(progress)
                                                                 path = await dm.download_file(
                                                                     files[idx],
                                                                     download_dir,
                                                                 )
                                                                 if path:
                                                                     paths.append(path)
+                                                            progress_text.empty()
+                                                            progress_bar.empty()
+                                                            if paths:
+                                                                st.success(f"Successfully downloaded {len(paths)} files to {download_dir}")
+                                                                if len(paths) > 1:
+                                                                    zip_path = os.path.join(download_dir, "downloads.zip")
+                                                                    with zipfile.ZipFile(zip_path, 'w') as zipf:
+                                                                        for file in paths:
+                                                                            zipf.write(file, os.path.basename(file))
+                                                                    st.success(f"Created zip file: {zip_path}")
                                             else:
                                                 st.warning("No files found on this page.")
                             else:
                 asyncio.run(run_search())
     else:  # PDF Summarizer mode
+        if summarizer is None:
+            st.error("PDF summarization is not available due to model loading errors.")
+        else:
+            st.header("PDF Summarizer")
+            pdf_url = st.text_input("Enter PDF URL")
+            if st.button("Summarize"):
+                if pdf_url:
+                    with st.spinner("Generating summary..."):
+                        summary = summarize_pdf_url(pdf_url)
+                        st.write("Summary:")
+                        st.write(summary)
 if __name__ == "__main__":
     try: