Update app.py
Browse files
app.py
CHANGED
@@ -54,29 +54,56 @@ def load_models():
|
|
54 |
st.info("Downloading spaCy model...")
|
55 |
spacy.cli.download("en_core_web_sm")
|
56 |
nlp = spacy.load("en_core_web_sm")
|
57 |
-
|
58 |
-
# Load
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
return nlp, semantic_model, summarizer
|
|
|
66 |
except Exception as e:
|
67 |
st.error(f"Error loading models: {e}")
|
68 |
return None, None, None
|
69 |
|
70 |
-
# Initialize
|
71 |
-
with st.spinner("Setting up dependencies..."):
|
72 |
-
install_playwright_dependencies()
|
73 |
-
|
74 |
with st.spinner("Loading models..."):
|
75 |
nlp_model, semantic_model, summarizer = load_models()
|
76 |
|
77 |
-
if
|
78 |
-
st.error("Failed to load
|
79 |
st.stop()
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
# Rest of your imports and code here...
|
82 |
|
@@ -625,11 +652,16 @@ def main():
|
|
625 |
if files:
|
626 |
st.success(f"Found {len(files)} files!")
|
627 |
|
628 |
-
|
629 |
-
|
630 |
-
|
|
|
|
|
|
|
|
|
631 |
|
632 |
# Download section
|
|
|
633 |
selected_files = st.multiselect(
|
634 |
"Select files to download",
|
635 |
range(len(files)),
|
@@ -637,13 +669,22 @@ def main():
|
|
637 |
)
|
638 |
|
639 |
if selected_files:
|
640 |
-
|
641 |
-
|
642 |
-
|
643 |
-
|
644 |
-
|
645 |
-
|
646 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
647 |
path = await dm.download_file(
|
648 |
files[idx],
|
649 |
download_dir,
|
@@ -651,11 +692,21 @@ def main():
|
|
651 |
)
|
652 |
if path:
|
653 |
paths.append(path)
|
654 |
-
|
|
|
|
|
|
|
655 |
|
656 |
-
|
657 |
-
|
658 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
659 |
else:
|
660 |
st.warning("No files found.")
|
661 |
|
@@ -689,9 +740,13 @@ def main():
|
|
689 |
st.session_state.current_url = url
|
690 |
st.success(f"Found {len(files)} files!")
|
691 |
|
692 |
-
|
693 |
-
|
694 |
-
|
|
|
|
|
|
|
|
|
695 |
|
696 |
selected_files = st.multiselect(
|
697 |
"Select files to download",
|
@@ -700,11 +755,20 @@ def main():
|
|
700 |
)
|
701 |
|
702 |
if selected_files:
|
703 |
-
|
704 |
-
|
705 |
-
|
706 |
-
|
707 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
708 |
path = await dm.download_file(
|
709 |
files[idx],
|
710 |
download_dir,
|
@@ -712,8 +776,18 @@ def main():
|
|
712 |
)
|
713 |
if path:
|
714 |
paths.append(path)
|
715 |
-
|
716 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
717 |
else:
|
718 |
st.warning("No files found on this page.")
|
719 |
else:
|
@@ -722,15 +796,18 @@ def main():
|
|
722 |
asyncio.run(run_search())
|
723 |
|
724 |
else: # PDF Summarizer mode
|
725 |
-
|
726 |
-
|
727 |
-
|
728 |
-
|
729 |
-
|
730 |
-
|
731 |
-
|
732 |
-
|
733 |
-
st.
|
|
|
|
|
|
|
734 |
|
735 |
if __name__ == "__main__":
|
736 |
try:
|
|
|
54 |
st.info("Downloading spaCy model...")
|
55 |
spacy.cli.download("en_core_web_sm")
|
56 |
nlp = spacy.load("en_core_web_sm")
|
57 |
+
|
58 |
+
# Semantic-search encoder.
# A copy found under the local sentence-transformers cache directory is
# loaded by path; otherwise the bare model name is handed to
# SentenceTransformer and the UI shows a download notice first.
# Any failure is reported in the UI and leaves semantic_model as None so the
# remaining models can still be loaded.
try:
    from sentence_transformers import SentenceTransformer
    model_name = 'all-MiniLM-L6-v2'
    cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
    local_copy = os.path.join(cache_dir, model_name)
    if not os.path.exists(local_copy):
        st.warning(f"Downloading SentenceTransformer model {model_name}...")
        semantic_model = SentenceTransformer(model_name)
    else:
        semantic_model = SentenceTransformer(local_copy)
except Exception as e:
    # Deliberately broad: a missing package, a failed download, or a corrupt
    # cache must all degrade to "no semantic search" rather than crash.
    st.error(f"Error loading SentenceTransformer: {e}")
    st.info("Continuing without semantic search capability...")
    semantic_model = None
|
72 |
+
|
73 |
+
# Summarization pipeline.
# Only `pipeline` is imported: the previous import also requested
# `AutoModelForSeq2SeqGeneration`, which transformers does not export, so the
# import itself raised and the summarizer was always disabled. Neither that
# name nor `AutoTokenizer` was used afterwards.
# Both code paths now load the same checkpoint (model_name); previously the
# "download" path fell back to the library's default summarization model,
# contradicting the notice shown to the user.
# Any failure is reported in the UI and leaves summarizer as None.
try:
    from transformers import pipeline

    model_name = "facebook/bart-large-cnn"
    cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
    # NOTE(review): this path may not match the cache layout of the installed
    # transformers version - confirm. It only decides whether the download
    # notice is shown; the pipeline call resolves its own cache.
    if not os.path.exists(os.path.join(cache_dir, model_name)):
        st.warning(f"Downloading Transformer model {model_name}...")
    summarizer = pipeline("summarization", model=model_name)
except Exception as e:
    # Deliberately broad: a missing package, a failed download, or running out
    # of memory must all degrade to "no summarization" rather than crash.
    st.error(f"Error loading Transformers: {e}")
    st.info("Continuing without summarization capability...")
    summarizer = None
|
87 |
+
|
88 |
return nlp, semantic_model, summarizer
|
89 |
+
|
90 |
except Exception as e:
|
91 |
st.error(f"Error loading models: {e}")
|
92 |
return None, None, None
|
93 |
|
94 |
+
# Load every model once at startup.
# The spaCy model is mandatory: without it the app reports the failure and
# stops. The semantic-search and summarization models are optional; when
# either is missing the user is warned that the matching feature is off.
with st.spinner("Loading models..."):
    nlp_model, semantic_model, summarizer = load_models()

if nlp_model is not None:
    # Core model is available - report which optional features are missing.
    if semantic_model is None:
        st.warning("Semantic search features will be disabled.")
    if summarizer is None:
        st.warning("PDF summarization features will be disabled.")
else:
    st.error("Failed to load essential NLP model. The application cannot continue.")
    st.stop()
|
107 |
|
108 |
# Rest of your imports and code here...
|
109 |
|
|
|
652 |
if files:
|
653 |
st.success(f"Found {len(files)} files!")
|
654 |
|
655 |
+
with st.expander("Found Files", expanded=True):
|
656 |
+
for i, file in enumerate(files):
|
657 |
+
col1, col2 = st.columns([3, 1])
|
658 |
+
with col1:
|
659 |
+
st.write(f"{i+1}. {file['filename']}")
|
660 |
+
with col2:
|
661 |
+
st.write(f"Size: {file['size']}")
|
662 |
|
663 |
# Download section
|
664 |
+
st.subheader("Download Files")
|
665 |
selected_files = st.multiselect(
|
666 |
"Select files to download",
|
667 |
range(len(files)),
|
|
|
669 |
)
|
670 |
|
671 |
if selected_files:
|
672 |
+
col1, col2 = st.columns([3, 1])
|
673 |
+
with col1:
|
674 |
+
download_dir = st.text_input("Download Directory", value="./downloads")
|
675 |
+
with col2:
|
676 |
+
if st.button("Download Selected", use_container_width=True):
|
677 |
+
async def download_files():
|
678 |
+
async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
|
679 |
+
paths = []
|
680 |
+
progress_text = st.empty()
|
681 |
+
progress_bar = st.progress(0)
|
682 |
+
|
683 |
+
for i, idx in enumerate(selected_files):
|
684 |
+
progress = (i + 1) / len(selected_files)
|
685 |
+
progress_text.text(f"Downloading {files[idx]['filename']}...")
|
686 |
+
progress_bar.progress(progress)
|
687 |
+
|
688 |
path = await dm.download_file(
|
689 |
files[idx],
|
690 |
download_dir,
|
|
|
692 |
)
|
693 |
if path:
|
694 |
paths.append(path)
|
695 |
+
|
696 |
+
progress_text.empty()
|
697 |
+
progress_bar.empty()
|
698 |
+
return paths
|
699 |
|
700 |
+
downloaded = asyncio.run(download_files())
|
701 |
+
if downloaded:
|
702 |
+
st.success(f"Successfully downloaded {len(downloaded)} files to {download_dir}")
|
703 |
+
# Create zip file if multiple files were downloaded
|
704 |
+
if len(downloaded) > 1:
|
705 |
+
zip_path = os.path.join(download_dir, "downloads.zip")
|
706 |
+
with zipfile.ZipFile(zip_path, 'w') as zipf:
|
707 |
+
for file in downloaded:
|
708 |
+
zipf.write(file, os.path.basename(file))
|
709 |
+
st.success(f"Created zip file: {zip_path}")
|
710 |
else:
|
711 |
st.warning("No files found.")
|
712 |
|
|
|
740 |
st.session_state.current_url = url
|
741 |
st.success(f"Found {len(files)} files!")
|
742 |
|
743 |
+
with st.expander("Found Files", expanded=True):
|
744 |
+
for j, file in enumerate(files):
|
745 |
+
col1, col2 = st.columns([3, 1])
|
746 |
+
with col1:
|
747 |
+
st.write(f"{j+1}. {file['filename']}")
|
748 |
+
with col2:
|
749 |
+
st.write(f"Size: {file['size']}")
|
750 |
|
751 |
selected_files = st.multiselect(
|
752 |
"Select files to download",
|
|
|
755 |
)
|
756 |
|
757 |
if selected_files:
|
758 |
+
col1, col2 = st.columns([3, 1])
|
759 |
+
with col1:
|
760 |
+
download_dir = st.text_input("Download Directory", value="./downloads")
|
761 |
+
with col2:
|
762 |
+
if st.button("Download Selected Files"):
|
763 |
+
progress_text = st.empty()
|
764 |
+
progress_bar = st.progress(0)
|
765 |
+
|
766 |
+
paths = []
|
767 |
+
for k, idx in enumerate(selected_files):
|
768 |
+
progress = (k + 1) / len(selected_files)
|
769 |
+
progress_text.text(f"Downloading {files[idx]['filename']}...")
|
770 |
+
progress_bar.progress(progress)
|
771 |
+
|
772 |
path = await dm.download_file(
|
773 |
files[idx],
|
774 |
download_dir,
|
|
|
776 |
)
|
777 |
if path:
|
778 |
paths.append(path)
|
779 |
+
|
780 |
+
progress_text.empty()
|
781 |
+
progress_bar.empty()
|
782 |
+
|
783 |
+
if paths:
|
784 |
+
st.success(f"Successfully downloaded {len(paths)} files to {download_dir}")
|
785 |
+
if len(paths) > 1:
|
786 |
+
zip_path = os.path.join(download_dir, "downloads.zip")
|
787 |
+
with zipfile.ZipFile(zip_path, 'w') as zipf:
|
788 |
+
for file in paths:
|
789 |
+
zipf.write(file, os.path.basename(file))
|
790 |
+
st.success(f"Created zip file: {zip_path}")
|
791 |
else:
|
792 |
st.warning("No files found on this page.")
|
793 |
else:
|
|
|
796 |
asyncio.run(run_search())
|
797 |
|
798 |
else: # PDF Summarizer mode
|
799 |
+
# PDF Summarizer mode.
# If the summarization model failed to load, say so and offer nothing else.
# Otherwise ask for a PDF URL and, on "Summarize", show the text returned by
# summarize_pdf_url() for that URL.
if summarizer is None:
    st.error("PDF summarization is not available due to model loading errors.")
else:
    st.header("PDF Summarizer")
    pdf_url = st.text_input("Enter PDF URL")

    if st.button("Summarize"):
        # Treat whitespace-only input as empty, and tell the user instead of
        # silently ignoring the button press.
        cleaned_url = (pdf_url or "").strip()
        if cleaned_url:
            with st.spinner("Generating summary..."):
                summary = summarize_pdf_url(cleaned_url)
                st.write("Summary:")
                st.write(summary)
        else:
            st.warning("Please enter a PDF URL.")
|
811 |
|
812 |
if __name__ == "__main__":
|
813 |
try:
|