Update app.py
app.py
CHANGED
@@ -32,6 +32,14 @@ import googleapiclient.discovery
 import google.auth.transport.requests
 import googleapiclient.http
 
+# New imports for RAG search
+import nltk
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import docx2txt
+import PyPDF2
+
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     level=logging.INFO,
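A note on what the new imports buy: TF-IDF vectors plus cosine similarity are the entire ranking mechanism behind the RAGSearch class added below (nltk is imported here but not referenced by any line in this diff). A minimal, self-contained illustration of that mechanism:

```python
# Illustration only (not part of the commit): the TF-IDF + cosine-similarity
# ranking that RAGSearch builds on.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["exam paper on calculus", "lecture notes on linear algebra"]
vectorizer = TfidfVectorizer(stop_words='english')
doc_vectors = vectorizer.fit_transform(docs)

query_vector = vectorizer.transform(["calculus exam"])
print(cosine_similarity(query_vector, doc_vectors).flatten())
# the first document scores higher, so it would rank first
```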
@@ -88,6 +96,77 @@ PROXY_ROTATION_CONFIG = {
     "proxies": [] # Will be populated from the UI if needed
 }
 
+# -------------------- RAG Search Class --------------------
+class RAGSearch:
+    def __init__(self):
+        self.file_texts = []
+        self.file_metadata = []
+        self.vectorizer = TfidfVectorizer(stop_words='english')
+        self.vectors = None
+
+    def add_file(self, file_data, file_info):
+        """Add a file to the search index"""
+        file_ext = os.path.splitext(file_info['filename'])[1]
+        text = self.extract_text(file_data, file_ext)
+        if text:
+            self.file_texts.append(text)
+            self.file_metadata.append(file_info)
+            return True
+        return False
+
+    def extract_text(self, file_data, file_ext):
+        """Extract text from different file types"""
+        try:
+            if file_ext.lower() == '.pdf':
+                reader = PyPDF2.PdfReader(BytesIO(file_data))
+                text = ""
+                for page in reader.pages:
+                    text += page.extract_text() + "\n"
+                return text
+            elif file_ext.lower() in ['.docx', '.doc']:
+                return docx2txt.process(BytesIO(file_data))
+            elif file_ext.lower() in ['.txt', '.csv', '.json']:
+                return file_data.decode('utf-8', errors='ignore')
+            else:
+                return ""
+        except Exception as e:
+            logger.error(f"Error extracting text: {e}")
+            return ""
+
+    def build_index(self):
+        """Build the search index"""
+        if not self.file_texts:
+            return False
+        try:
+            self.vectors = self.vectorizer.fit_transform(self.file_texts)
+            return True
+        except Exception as e:
+            logger.error(f"Error building search index: {e}")
+            return False
+
+    def search(self, query, top_k=5):
+        """Search the index for relevant files"""
+        if self.vectors is None:
+            return []
+
+        try:
+            query_vector = self.vectorizer.transform([query])
+            similarities = cosine_similarity(query_vector, self.vectors).flatten()
+            top_indices = similarities.argsort()[-top_k:][::-1]
+
+            results = []
+            for i, idx in enumerate(top_indices):
+                if similarities[idx] > 0:
+                    results.append({
+                        'file_info': self.file_metadata[idx],
+                        'score': float(similarities[idx]),
+                        'rank': i+1
+                    })
+            return results
+        except Exception as e:
+            logger.error(f"Error during search: {e}")
+            return []
+
 # -------------------- Utility Functions --------------------
 def get_random_user_agent():
     return random.choice(USER_AGENTS)
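For readers skimming the diff, a hypothetical standalone use of the class above, outside Streamlit. It assumes `from io import BytesIO` and a module-level `logger` are defined earlier in app.py (this diff does not show them), and the file names are invented:

```python
# Hypothetical usage of the RAGSearch class above; file names are examples.
import os

rag = RAGSearch()
for name in ["syllabus.pdf", "notes.txt"]:
    with open(name, "rb") as f:
        data = f.read()
    # file_info mirrors the dicts built by extract_downloadable_files
    rag.add_file(data, {"filename": name, "size": f"{os.path.getsize(name)} bytes"})

if rag.build_index():
    for hit in rag.search("integration by parts", top_k=3):
        print(hit["rank"], hit["file_info"]["filename"], round(hit["score"], 2))
```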
@@ -857,6 +936,118 @@ class DownloadManager:
             logger.error(f"Error getting exam links: {e}")
             return []
 
+    async def discover_hidden_links(self, page):
+        """Discover hidden links that might be in JavaScript, iframes, or dynamic content"""
+        hidden_links = set()
+
+        # Execute JavaScript to find links in script tags and data attributes
+        js_links = await page.evaluate("""
+            () => {
+                const links = new Set();
+
+                // Extract URLs from script tags
+                const scripts = document.querySelectorAll('script');
+                for (const script of scripts) {
+                    const content = script.textContent || '';
+                    const urlMatches = content.match(/["'](https?:\/\/[^"']+)["']/g) || [];
+                    for (let match of urlMatches) {
+                        links.add(match.replace(/["']/g, ''));
+                    }
+                }
+
+                // Check for links in data attributes
+                const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link]');
+                for (const el of elements) {
+                    for (const attr of ['data-url', 'data-href', 'data-src', 'data-link']) {
+                        const val = el.getAttribute(attr);
+                        if (val && val.match(/^https?:\/\//)) {
+                            links.add(val);
+                        }
+                    }
+                }
+
+                // Look for URLs in inline event handlers
+                const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup]');
+                for (const el of clickableElements) {
+                    for (const attr of ['onclick', 'onmousedown', 'onmouseup']) {
+                        const val = el.getAttribute(attr);
+                        if (val) {
+                            const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || [];
+                            for (let match of urlMatches) {
+                                links.add(match.replace(/["']/g, ''));
+                            }
+                        }
+                    }
+                }
+
+                return Array.from(links);
+            }
+        """)
+
+        for link in js_links:
+            hidden_links.add(link)
+
+        # Extract links from iframes
+        iframes = await page.query_selector_all('iframe')
+        for iframe in iframes:
+            try:
+                frame = await iframe.content_frame()
+                if frame:
+                    iframe_links = await frame.evaluate("""
+                        () => {
+                            return Array.from(document.querySelectorAll('a[href]'))
+                                .map(a => a.href)
+                                .filter(href => href.startsWith('http'));
+                        }
+                    """)
+                    for link in iframe_links:
+                        hidden_links.add(link)
+            except Exception as e:
+                logger.warning(f"Could not extract links from iframe: {e}")
+
+        # Look for links in shadow DOM (used in modern web components)
+        shadow_links = await page.evaluate("""
+            () => {
+                const links = new Set();
+
+                // Helper function to recursively process shadow roots
+                function processShadowRoot(root) {
+                    if (!root) return;
+
+                    // Get links in this shadow root
+                    const shadowLinks = root.querySelectorAll('a[href]');
+                    for (const link of shadowLinks) {
+                        if (link.href && link.href.startsWith('http')) {
+                            links.add(link.href);
+                        }
+                    }
+
+                    // Process nested shadow roots
+                    const elements = root.querySelectorAll('*');
+                    for (const el of elements) {
+                        if (el.shadowRoot) {
+                            processShadowRoot(el.shadowRoot);
+                        }
+                    }
+                }
+
+                // Find all shadow roots in the document
+                const elements = document.querySelectorAll('*');
+                for (const el of elements) {
+                    if (el.shadowRoot) {
+                        processShadowRoot(el.shadowRoot);
+                    }
+                }
+
+                return Array.from(links);
+            }
+        """)
+
+        for link in shadow_links:
+            hidden_links.add(link)
+
+        return hidden_links
+
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
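All three `page.evaluate` scripts above pull quoted absolute URLs out of page text with the same regular expression. The same match, redone in Python purely as an illustration (in the commit it runs inside the browser):

```python
# What the script-tag scan matches, expressed in Python (illustration only).
import re

script_text = """fetch("https://example.com/files/a.pdf");
var mirror = 'https://cdn.example.com/a.pdf';"""
urls = re.findall(r'["\'](https?://[^"\']+)["\']', script_text)
print(urls)  # ['https://example.com/files/a.pdf', 'https://cdn.example.com/a.pdf']
```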
@@ -1176,6 +1367,17 @@ class DownloadManager:
                         'metadata': {}
                     })
 
+            # Check for hidden links that might be in JavaScript, iframes, or dynamic content
+            hidden_links = await self.discover_hidden_links(self.page)
+            for link in hidden_links:
+                if any(link.lower().endswith(ext) for ext in all_exts):
+                    found_files.append({
+                        'url': link,
+                        'filename': os.path.basename(link.split('?')[0]),
+                        'size': await self.get_file_size(link),
+                        'metadata': {}
+                    })
+
             # Deduplicate files by URL
             seen_urls = set()
             unique_files = []
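One detail of the filter just added: it applies `endswith` to the raw URL, so a wanted extension followed by a query string does not match, even though the filename is later derived with the query stripped. A small illustration:

```python
# Illustration of the endswith filter above (not part of the commit).
all_exts = ['.pdf', '.docx']
links = [
    'https://host.example/a.pdf',        # matches
    'https://host.example/a.pdf?dl=1',   # does not match: query string
    'https://host.example/index.html',   # does not match: wrong extension
]
matches = [l for l in links if any(l.lower().endswith(ext) for ext in all_exts)]
print(matches)  # ['https://host.example/a.pdf']
```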
@@ -2725,6 +2927,9 @@ def main():
         st.session_state.do_deep_search = False
         st.session_state.deep_search_url = None
         st.session_state.search_results = []
+        # For RAG search
+        st.session_state.rag_indexed = False
+        st.session_state.rag_engine = None
 
     with st.sidebar:
         mode = st.radio("Select Mode", ["Manual URL", "Bing Search"], key="mode_select")
@@ -2814,6 +3019,99 @@
 
         if st.session_state.discovered_files:
             files = st.session_state.discovered_files
+
+            # Display files with direct download buttons
+            download_dir = "./downloads"
+            os.makedirs(download_dir, exist_ok=True)
+
+            # Add RAG Search interface
+            st.markdown("### Search Within Discovered Files")
+            search_query = st.text_input("Enter search terms", key="rag_search_query")
+
+            if st.button("Search Files", key="rag_search_btn") and search_query:
+                # Initialize RAG search engine
+                if not st.session_state.rag_indexed:
+                    rag_search = RAGSearch()
+
+                    with st.spinner("Indexing files for search..."):
+                        # First download files to extract text
+                        temp_dir = "./temp_downloads"
+                        os.makedirs(temp_dir, exist_ok=True)
+
+                        async def download_for_indexing():
+                            downloaded = 0
+                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                for i, file_info in enumerate(files):
+                                    # Only process common text-based file formats
+                                    ext = os.path.splitext(file_info['filename'])[1].lower()
+                                    if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json']:
+                                        path = await dm.download_file(file_info, temp_dir, url)
+                                        if path:
+                                            with open(path, 'rb') as f:
+                                                file_data = f.read()
+
+                                            # Add to search index
+                                            if rag_search.add_file(file_data, file_info):
+                                                downloaded += 1
+
+                                            # Clean up
+                                            os.remove(path)
+                            return downloaded
+
+                        indexed_count = asyncio.run(download_for_indexing())
+                        if indexed_count > 0:
+                            rag_search.build_index()
+                            st.session_state.rag_engine = rag_search
+                            st.session_state.rag_indexed = True
+                            st.success(f"Indexed {indexed_count} files for search")
+                        else:
+                            st.warning("Could not index any files. Try with more text-based documents.")
+
+                # Perform the search
+                if st.session_state.rag_indexed:
+                    search_results = st.session_state.rag_engine.search(search_query)
+
+                    if search_results:
+                        st.write(f"Found {len(search_results)} relevant files:")
+
+                        for result in search_results:
+                            file_info = result['file_info']
+                            score = result['score']
+                            with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
+                                st.write(f"Size: {file_info['size']}")
+                                if 'metadata' in file_info and file_info['metadata']:
+                                    st.write("Metadata:")
+                                    for k, v in file_info['metadata'].items():
+                                        if k != 'file_id':  # Skip technical details
+                                            st.write(f"- {k}: {v}")
+
+                                # Add direct download button
+                                if st.button(f"Download this file", key=f"rag_dl_{result['rank']}"):
+                                    with st.spinner(f"Downloading {file_info['filename']}..."):
+                                        async def download_search_result():
+                                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                                path = await dm.download_file(file_info, download_dir, url)
+                                                return path
+
+                                        path = asyncio.run(download_search_result())
+                                        if path:
+                                            with open(path, "rb") as f:
+                                                file_data = f.read()
+
+                                            st.download_button(
+                                                label=f"Save {file_info['filename']}",
+                                                data=file_data,
+                                                file_name=file_info['filename'],
+                                                mime=mimetypes.guess_type(path)[0] or "application/octet-stream",
+                                                key=f"save_rag_{result['rank']}"
+                                            )
+                    else:
+                        st.warning("No matching files found for your query.")
+
+            # Show all files with direct download options
+            st.markdown("### All Discovered Files")
+
+            # Batch download options
             col1, col2 = st.columns([1, 4])
             with col1:
                 if st.button("Select All", key="select_all_btn"):
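Condensed and taken out of Streamlit, the indexing flow wired into the UI above looks roughly like the sketch below. The DownloadManager constructor arguments and the file-dict shape follow app.py, but treat this as an approximation, not part of the commit:

```python
# Sketch of the index-then-search flow (constructor args simplified).
import asyncio, os

async def build_rag_index(files, url, temp_dir="./temp_downloads"):
    os.makedirs(temp_dir, exist_ok=True)
    rag = RAGSearch()
    async with DownloadManager(use_proxy=False, proxy=None, use_stealth=False) as dm:
        for info in files:
            ext = os.path.splitext(info['filename'])[1].lower()
            if ext in ('.pdf', '.doc', '.docx', '.txt', '.csv', '.json'):
                path = await dm.download_file(info, temp_dir, url)
                if path:
                    with open(path, 'rb') as f:
                        rag.add_file(f.read(), info)
                    os.remove(path)
    return rag if rag.build_index() else None

# usage: rag = asyncio.run(build_rag_index(files, url)); rag.search("my query")
```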
@@ -2821,43 +3119,19 @@
                 if st.button("Clear Selection", key="clear_selection_btn"):
                     st.session_state.selected_files = []
 
-            #
-            file_options = []
-            for i, file in enumerate(files):
-                filename = file['filename']
-                size = file['size']
-                meta = file.get('metadata', {})
-
-                # Format display string with relevant metadata
-                if meta and 'Pages' in meta:
-                    file_info = f"{filename} ({size}) - {meta.get('Pages', '')} pages"
-                else:
-                    file_info = f"{filename} ({size})"
-
-                file_options.append((i, file_info))
-
-            selected_indices = st.multiselect(
-                "Select files to download",
-                options=[i for i, _ in file_options],
-                default=st.session_state.selected_files,
-                format_func=lambda i: next(info for idx, info in file_options if idx == i),
-                key="file_multiselect"
-            )
-
-            st.session_state.selected_files = selected_indices
-
-            if selected_indices:
-                batch_col1, batch_col2, batch_col3, batch_col4 = st.columns(4)
-                with col1:
+            # Batch download settings
+            if 'selected_files' in st.session_state and st.session_state.selected_files:
+                batch_col1, batch_col2, batch_col3, batch_col4 = st.columns(4)
+                with batch_col1:
                     download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
-                with col2:
+                with batch_col2:
                     create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
-                with col3:
+                with batch_col3:
                     delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
-                with col4:
+                with batch_col4:
                     upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
 
-                if st.button("Download Selected", key="
+                if st.button("Download Selected", key="download_batch_btn"):
                     if not os.path.exists(download_dir):
                         os.makedirs(download_dir)
 
@@ -2871,10 +3145,10 @@
                         proxy=proxy,
                         use_stealth=use_stealth
                     ) as dm:
-                        for i, idx in enumerate(selected_indices):
-                            progress = (i + 1) / len(selected_indices)
+                        for i, idx in enumerate(st.session_state.selected_files):
+                            progress = (i + 1) / len(st.session_state.selected_files)
                             file_info = files[idx]
-                            status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_indices)})")
+                            status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(st.session_state.selected_files)})")
                             progress_bar.progress(progress)
 
                             path = await dm.download_file(file_info, download_dir, url)
@@ -2925,23 +3199,49 @@
                             except Exception as e:
                                 st.warning(f"Could not delete {path}: {e}")
                         st.info("Deleted original files after ZIP creation")
-
-
-
-
-
+
+            # Individual file display with direct download buttons
+            for i, file in enumerate(files):
+                col1, col2, col3 = st.columns([3, 1, 1])
+                with col1:
+                    filename = file['filename']
+                    size = file['size']
+                    meta = file.get('metadata', {})
+                    file_info = f"{filename} ({size})"
+                    if meta and 'Pages' in meta:
+                        file_info += f" - {meta.get('Pages', '')} pages"
+                    st.markdown(f"**{i+1}. {file_info}**")
+
+                with col2:
+                    # Add direct download button for each file
+                    if st.button(f"Download", key=f"direct_dl_{i}"):
+                        with st.spinner(f"Downloading {filename}..."):
+                            async def download_single_file():
+                                async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                    path = await dm.download_file(file, download_dir, url)
+                                    return path
+
+                            downloaded_path = asyncio.run(download_single_file())
+                            if downloaded_path:
+                                with open(downloaded_path, "rb") as f:
                                     file_data = f.read()
 
-                                file_name = os.path.basename(path)
-                                mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream"
-
                                 st.download_button(
-                                    label=f"
+                                    label=f"Save {filename}",
                                     data=file_data,
-                                    file_name=file_name,
-                                    mime=mime_type,
-                                    key=f"
+                                    file_name=filename,
+                                    mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream",
+                                    key=f"save_file_{i}"
                                 )
+
+                with col3:
+                    # Add to selection for batch download
+                    if i in st.session_state.selected_files:
+                        if st.button("Unselect", key=f"unselect_{i}"):
+                            st.session_state.selected_files.remove(i)
+                    else:
+                        if st.button("Select", key=f"select_{i}"):
+                            st.session_state.selected_files.append(i)
 
     elif mode == "Bing Search":
         st.header("Bing Search Mode")
@@ -3007,6 +3307,52 @@
                 st.session_state.discovered_files = files
                 st.session_state.current_url = url
                 st.success(f"Found {len(files)} files!")
+
+                # Show files with direct download options
+                download_dir = "./downloads"
+                os.makedirs(download_dir, exist_ok=True)
+
+                for i, file in enumerate(files):
+                    col1, col2, col3 = st.columns([3, 1, 1])
+                    with col1:
+                        filename = file['filename']
+                        size = file['size']
+                        meta = file.get('metadata', {})
+                        file_info = f"{filename} ({size})"
+                        if meta and 'Pages' in meta:
+                            file_info += f" - {meta.get('Pages', '')} pages"
+                        st.markdown(f"**{i+1}. {file_info}**")
+
+                    with col2:
+                        # Add direct download button for each file
+                        if st.button(f"Download", key=f"direct_dl_{i}"):
+                            with st.spinner(f"Downloading {filename}..."):
+                                async def download_single_file():
+                                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                        path = await dm.download_file(file, download_dir, url)
+                                        return path
+
+                                downloaded_path = asyncio.run(download_single_file())
+                                if downloaded_path:
+                                    with open(downloaded_path, "rb") as f:
+                                        file_data = f.read()
+
+                                    st.download_button(
+                                        label=f"Save {filename}",
+                                        data=file_data,
+                                        file_name=filename,
+                                        mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream",
+                                        key=f"save_file_{i}"
+                                    )
+
+                    with col3:
+                        # Add to selection for batch download
+                        if i in st.session_state.selected_files:
+                            if st.button("Unselect", key=f"unselect_{i}"):
+                                st.session_state.selected_files.remove(i)
+                        else:
+                            if st.button("Select", key=f"select_{i}"):
+                                st.session_state.selected_files.append(i)
             else:
                 st.warning("No files found.")
 
@@ -3054,7 +3400,7 @@
 
     # Add footer with attribution
     st.markdown('---')
-    st.markdown('Created by [Euler314](https://github.com/
+    st.markdown('Created by [Euler314](https://github.com/yu314coder)')
 
 if __name__ == "__main__":
     main()