import streamlit as st

# st.set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="Advanced File Downloader", layout="wide")

import os
import subprocess
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
import asyncio
import logging
from urllib.parse import urlparse, urljoin, quote_plus
import re
from pathlib import Path
from io import BytesIO
import random
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import zipfile
import tempfile
import mimetypes
import requests

import spacy
import spacy.cli
from spacy.language import Language


# Register a no-op factory under the curated-transformers component name so that
# pipelines which reference it can still be loaded without the real component.
@Language.factory("spacy-curated-transformers_RobertaTransformer_v1")
def dummy_roberta_transformer(nlp, name):
    def dummy(doc):
        return doc
    return dummy


@st.cache_resource
def load_nlp_model():
    try:
        nlp_model = spacy.load("en_core_web_sm")
    except OSError:
        st.write("Model en_core_web_sm not found. Downloading it now...")
        spacy.cli.download("en_core_web_sm")
        nlp_model = spacy.load("en_core_web_sm")
    return nlp_model


nlp_model = load_nlp_model()


from sentence_transformers import SentenceTransformer, util


@st.cache_resource
def load_semantic_model():
    return SentenceTransformer('all-MiniLM-L6-v2')


semantic_model = load_semantic_model()
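

# The semantic model loaded above is not wired into the UI anywhere below. The helper
# here is a minimal sketch of how it could rank candidate result texts against the
# user's query; the function name and its use of util.cos_sim are illustrative
# additions, not part of the original app flow.
def rank_texts_by_similarity(query, texts):
    """Return indices of `texts` ordered by cosine similarity to `query` (highest first)."""
    if not texts:
        return []
    query_emb = semantic_model.encode(query, convert_to_tensor=True)
    text_embs = semantic_model.encode(texts, convert_to_tensor=True)
    scores = util.cos_sim(query_emb, text_embs)[0]
    return sorted(range(len(texts)), key=lambda i: float(scores[i]), reverse=True)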


from transformers import pipeline


@st.cache_resource
def load_summarizer():
    return pipeline("summarization")


summarizer = load_summarizer()


def summarize_pdf_url(pdf_url):
    try:
        with st.spinner("Downloading and processing PDF..."):
            response = requests.get(pdf_url, stream=True)
            response.raise_for_status()
            temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
            with open(temp_pdf.name, "wb") as f:
                f.write(response.content)
            reader = PdfReader(temp_pdf.name)
            text = " ".join([page.extract_text() or "" for page in reader.pages])
            os.remove(temp_pdf.name)
            # Keep the input well under the summarization model's context limit.
            limited_text = text[:3000]
            summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
            return summary[0]["summary_text"]
    except Exception as e:
        return f"Error summarizing PDF: {e}"


# Placeholder OAuth client configuration: replace these values with real credentials
# from the Google Cloud console before using the Drive integration.
GOOGLE_OAUTH_CONFIG = {
    "web": {
        "client_id": "your_client_id",
        "project_id": "your_project_id",
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token",
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "client_secret": "your_client_secret",
        "redirect_uris": ["your_redirect_uri"]
    }
}

import google_auth_oauthlib.flow
import googleapiclient.discovery
import google.auth.transport.requests


def get_google_auth_url():
    client_config = GOOGLE_OAUTH_CONFIG["web"]
    flow = google_auth_oauthlib.flow.Flow.from_client_config(
        {"web": client_config},
        scopes=["https://www.googleapis.com/auth/drive.file"]
    )
    flow.redirect_uri = client_config["redirect_uris"][0]
    authorization_url, _ = flow.authorization_url(
        access_type="offline",
        include_granted_scopes="true",
        prompt="consent"
    )
    return authorization_url


def exchange_code_for_credentials(auth_code):
    if not auth_code.strip():
        return None, "No code provided."
    try:
        client_config = GOOGLE_OAUTH_CONFIG["web"]
        flow = google_auth_oauthlib.flow.Flow.from_client_config(
            {"web": client_config},
            scopes=["https://www.googleapis.com/auth/drive.file"]
        )
        flow.redirect_uri = client_config["redirect_uris"][0]
        flow.fetch_token(code=auth_code.strip())
        creds = flow.credentials
        if not creds or not creds.valid:
            return None, "Could not validate credentials. Check code and try again."
        return creds, "Google Sign-In successful!"
    except Exception as e:
        return None, f"Error during token exchange: {e}"


def install_playwright_dependencies():
    os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
    os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
    try:
        subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
    except Exception as e:
        st.error(f"Error installing Playwright: {e}")


install_playwright_dependencies()


logging.basicConfig(
    filename='advanced_download_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger()


USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
]


def get_random_user_agent():
    return random.choice(USER_AGENTS)


def sizeof_fmt(num, suffix='B'):
    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f}Y{suffix}"


async def human_like_scroll(page):
    scroll_height = await page.evaluate('document.body.scrollHeight')
    viewport_height = await page.evaluate('window.innerHeight')
    current_scroll = 0
    while current_scroll < scroll_height:
        await page.evaluate(f'window.scrollTo(0, {current_scroll})')
        await asyncio.sleep(random.uniform(0.5, 1.5))
        current_scroll += viewport_height * random.uniform(0.5, 1.5)
        scroll_height = await page.evaluate('document.body.scrollHeight')


async def human_like_interactions(page):
    await page.mouse.move(random.randint(0, 1000), random.randint(0, 1000))
    await asyncio.sleep(random.uniform(0.5, 1.5))
    await page.mouse.click(random.randint(0, 1000), random.randint(0, 1000))
    await asyncio.sleep(random.uniform(0.5, 1.5))
    await page.evaluate("window.scrollBy(0, window.innerHeight / 2)")
    await asyncio.sleep(random.uniform(0.5, 1.5))


def nlp_preprocess(query: str) -> str:
    doc = nlp_model(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
    processed = " ".join(tokens)
    return processed if processed.strip() else query


def nlp_extract_entities(text: str):
    doc = nlp_model(text)
    return [(ent.text, ent.label_) for ent in doc.ents]


def ai_preprocess_query(query: str) -> str:
    return query


class DownloadManager:
    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
        self.use_proxy = use_proxy
        self.proxy = proxy
        self.query = query
        self.num_results = num_results
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        opts = {"headless": True}
        if self.use_proxy and self.proxy:
            opts["proxy"] = {"server": self.proxy}
        self.browser = await self.playwright.chromium.launch(**opts)
        self.context = await self.browser.new_context(user_agent=get_random_user_agent())
        self.page = await self.context.new_page()
        await self.page.set_extra_http_headers({
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://www.bing.com/'
        })
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def get_file_size(self, url):
        try:
            async with self.context.new_page() as page:
                response = await page.request.head(url, timeout=15000)
                length = response.headers.get('Content-Length', None)
                if length:
                    return sizeof_fmt(int(length))
                else:
                    return "Unknown Size"
        except Exception:
            return "Unknown Size"

    async def get_pdf_metadata(self, url):
        try:
            async with self.context.new_page() as page:
                resp = await page.request.get(url, timeout=15000)
                if resp.ok:
                    content = await resp.body()
                    pdf = BytesIO(content)
                    reader = PdfReader(pdf)
                    return {
                        'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
                        'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
                        'Pages': len(reader.pages),
                    }
                else:
                    return {}
        except Exception:
            return {}

    async def extract_real_download_url(self, url):
        try:
            async with self.context.new_page() as page:
                response = await page.goto(url, wait_until='networkidle', timeout=30000)

                if response and response.headers.get('location'):
                    return response.headers['location']

                content_type = response.headers.get('content-type', '')
                if 'text/html' not in content_type.lower():
                    return url

                content = await page.content()
                soup = BeautifulSoup(content, 'html.parser')
                meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
                if meta_refresh:
                    content = meta_refresh.get('content', '')
                    if 'url=' in content.lower():
                        return content.split('url=')[-1].strip()

                return page.url
        except Exception as e:
            logger.error(f"Error extracting real download URL: {e}")
            return url

    async def extract_downloadable_files(self, url, custom_ext_list):
        found_files = []
        try:
            response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
            if not response:
                return []

            final_url = self.page.url

            # Pages that look like download endpoints may redirect straight to a file.
            if '.php' in final_url or 'download' in final_url or 'get' in final_url:
                real_url = await self.extract_real_download_url(final_url)
                if real_url != final_url:
                    content_type = (await self.page.request.head(real_url)).headers.get('content-type', '')
                    if content_type and 'text/html' not in content_type.lower():
                        found_files.append({
                            'url': real_url,
                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
                            'size': await self.get_file_size(real_url),
                            'metadata': {}
                        })
                        return found_files

            await self.page.wait_for_load_state('networkidle', timeout=30000)
            await human_like_interactions(self.page)

            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', '.avi', '.mkv',
                            '.png', '.jpg', '.jpeg', '.gif', '.xlsx', '.xls', '.ppt', '.pptx', '.txt']
            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])

            parsed_base = urlparse(final_url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

            for a in soup.find_all('a', href=True):
                href = a['href'].strip()

                if not href or href.startswith('javascript:') or href == '#':
                    continue

                # Links that look like download endpoints: resolve the real file URL.
                if '.php' in href.lower() or 'download' in href.lower() or 'get' in href.lower():
                    full_url = href if href.startswith('http') else urljoin(base_url, href)
                    real_url = await self.extract_real_download_url(full_url)
                    if real_url and real_url != full_url:
                        size_str = await self.get_file_size(real_url)
                        found_files.append({
                            'url': real_url,
                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
                            'size': size_str,
                            'metadata': {}
                        })
                    continue

                # Direct links to files with known extensions.
                if any(href.lower().endswith(ext) for ext in all_exts):
                    file_url = href if href.startswith('http') else urljoin(base_url, href)
                    size_str = await self.get_file_size(file_url)
                    meta = {}

                    if file_url.lower().endswith('.pdf'):
                        meta = await self.get_pdf_metadata(file_url)

                    found_files.append({
                        'url': file_url,
                        'filename': os.path.basename(urlparse(file_url).path),
                        'size': size_str,
                        'metadata': meta
                    })

                # Google Drive links: convert to a direct-download URL.
                elif any(x in href for x in ['drive.google.com', 'docs.google.com']):
                    file_id = None
                    for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
                        match = re.search(pattern, href)
                        if match:
                            file_id = match.group(1)
                            break

                    if file_id:
                        direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
                        async with self.context.new_page() as page:
                            try:
                                response = await page.request.head(direct_url, timeout=15000)
                                filename = file_id
                                content_disposition = response.headers.get('content-disposition', '')
                                if content_disposition:
                                    filename_match = re.findall('filename="(.+?)"', content_disposition)
                                    if filename_match:
                                        filename = filename_match[0]

                                found_files.append({
                                    'url': direct_url,
                                    'filename': filename,
                                    'size': await self.get_file_size(direct_url),
                                    'metadata': {}
                                })
                            except Exception as e:
                                logger.error(f"Error processing Google Drive link: {e}")

            # De-duplicate by URL while preserving order.
            seen_urls = set()
            unique_files = []
            for f in found_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)

            return unique_files

        except Exception as e:
            logger.error(f"Error extracting files from {url}: {e}")
            return []

    async def download_file(self, file_info, save_dir, referer):
        file_url = file_info['url']
        fname = file_info['filename']
        path = os.path.join(save_dir, fname)

        # Avoid overwriting an existing file by appending a counter to the name.
        base, ext = os.path.splitext(fname)
        counter = 1
        while os.path.exists(path):
            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
            counter += 1

        os.makedirs(save_dir, exist_ok=True)

        try:
            # Google Drive downloads are delegated to gdown (optional dependency).
            if 'drive.google.com' in file_url:
                import gdown
                try:
                    st.write(f"Downloading from Google Drive: {fname}")
                    output = gdown.download(file_url, path, quiet=False)
                    if output:
                        return path
                    return None
                except Exception as e:
                    logger.error(f"Google Drive download error: {e}")
                    return None

            async with self.context.new_page() as page:
                st.write(f"Downloading: {fname}")

                headers = {
                    'Accept': '*/*',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Referer': referer
                }

                response = await page.request.get(file_url, headers=headers, timeout=30000)

                if response.status == 200:
                    content = await response.body()
                    with open(path, 'wb') as f:
                        f.write(content)
                    return path
                else:
                    logger.error(f"Download failed with status {response.status}: {file_url}")
                    return None

        except Exception as e:
            logger.error(f"Error downloading {file_url}: {e}")
            return None

    async def search_bing(self):
        """Search Bing and return (urls, info), where info holds a snippet and named entities per result."""
        if not self.query:
            return [], []

        search_query = self.query
        if "filetype:pdf" not in search_query.lower():
            search_query += " filetype:pdf"

        search_url = f"https://www.bing.com/search?q={quote_plus(search_query)}&count={self.num_results}"

        try:
            await self.page.goto(search_url, timeout=30000)
            await self.page.wait_for_selector('li.b_algo', timeout=30000)
            await human_like_scroll(self.page)

            urls = []
            info = []
            elements = await self.page.query_selector_all('li.b_algo')

            for element in elements:
                link = await element.query_selector('h2 a')
                if not link:
                    continue
                url = await link.get_attribute('href')
                if not url:
                    continue

                # Best-effort snippet extraction; Bing's markup may change.
                snippet = ""
                snippet_el = await element.query_selector('.b_caption p')
                if snippet_el:
                    snippet = (await snippet_el.inner_text()).strip()

                urls.append(url)
                info.append({
                    'snippet': snippet,
                    'entities': nlp_extract_entities(snippet) if snippet else [],
                })

            return urls[:self.num_results], info[:self.num_results]

        except Exception as e:
            logger.error(f"Bing search error: {e}")
            return [], []

    async def get_sublinks(self, url, limit=100):
        try:
            await self.page.goto(url, timeout=30000)
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

            links = set()
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if href.startswith('http'):
                    links.add(href)
                elif href.startswith('/'):
                    links.add(f"{base_url}{href}")

            return list(links)[:limit]

        except Exception as e:
            logger.error(f"Error getting sublinks: {e}")
            return []

    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, max_concurrency=20):
        # max_concurrency is accepted because the UI exposes a concurrency slider, but
        # sublinks are currently crawled sequentially on the shared page, so it is not
        # yet used to parallelise the crawl.
        if not custom_ext_list:
            custom_ext_list = []

        progress_text = st.empty()
        progress_bar = st.progress(0)

        try:
            progress_text.text("Analyzing main page...")
            main_files = await self.extract_downloadable_files(url, custom_ext_list)

            progress_text.text("Getting sublinks...")
            sublinks = await self.get_sublinks(url, sublink_limit)

            if not sublinks:
                progress_bar.progress(1.0)
                return main_files

            all_files = main_files
            total_links = len(sublinks)

            for i, sublink in enumerate(sublinks, 1):
                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
                progress_bar.progress(i / total_links)

                sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                all_files.extend(sub_files)

            # De-duplicate by URL while preserving order.
            seen_urls = set()
            unique_files = []
            for f in all_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)

            progress_text.text(f"Found {len(unique_files)} unique files")
            progress_bar.progress(1.0)

            return unique_files

        except Exception as e:
            logger.error(f"Deep search error: {e}")
            return []
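
    async def preview_page(self, url):
        # preview_page() is called from main() but was missing from the class. This is a
        # minimal sketch: it loads the page and returns a small HTML snippet (title plus
        # the first part of the visible text) for display with st.markdown.
        try:
            await self.page.goto(url, timeout=30000, wait_until='networkidle')
            title = await self.page.title()
            soup = BeautifulSoup(await self.page.content(), 'html.parser')
            text = soup.get_text(separator=' ', strip=True)[:1000]
            return f"<h3>{title}</h3><p>{text}</p>"
        except Exception as e:
            logger.error(f"Error previewing {url}: {e}")
            return f"<p>Could not preview {url}: {e}</p>"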


def main():
    if 'initialized' not in st.session_state:
        st.session_state.initialized = True
        st.session_state.discovered_files = []
        st.session_state.current_url = None
        st.session_state.google_creds = None

    st.title("Advanced File Downloader")

    with st.sidebar:
        st.header("Settings")
        mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])

        with st.expander("Advanced Options"):
            custom_extensions = st.text_input(
                "Custom File Extensions",
                placeholder=".csv, .txt, .epub"
            )
            max_concurrency = st.slider(
                "Max Concurrency",
                min_value=1,
                max_value=1000,
                value=200
            )
            use_proxy = st.checkbox("Use Proxy")
            proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")

        with st.expander("Google Drive Integration"):
            if st.button("Start Google Sign-In"):
                auth_url = get_google_auth_url()
                st.markdown(f"[Click here to authorize]({auth_url})")

            auth_code = st.text_input("Enter authorization code")
            if st.button("Complete Sign-In") and auth_code:
                creds, msg = exchange_code_for_credentials(auth_code)
                st.session_state.google_creds = creds
                st.write(msg)

    if mode == "Manual URL":
        st.header("Manual URL Mode")
        url = st.text_input("Enter URL", placeholder="https://example.com")

        col1, col2 = st.columns(2)
        with col1:
            if st.button("Deep Search", use_container_width=True):
                if url:
                    async def run_deep_search():
                        async with DownloadManager(
                            use_proxy=use_proxy,
                            proxy=proxy
                        ) as dm:
                            with st.spinner("Searching for files..."):
                                files = await dm.deep_search(
                                    url=url,
                                    custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
                                    max_concurrency=max_concurrency
                                )
                                st.session_state.discovered_files = files
                                st.session_state.current_url = url
                                return files

                    files = asyncio.run(run_deep_search())
                    if files:
                        st.success(f"Found {len(files)} files!")
                    else:
                        st.warning("No files found.")

        with col2:
            if st.button("Preview Page", use_container_width=True):
                if url:
                    async def preview():
                        async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
                            with st.spinner("Loading preview..."):
                                return await dm.preview_page(url)

                    preview_html = asyncio.run(preview())
                    st.markdown(preview_html, unsafe_allow_html=True)

        if st.session_state.discovered_files:
            with st.expander("Download Options", expanded=True):
                file_options = [f"{f['filename']} ({f['size']})" for f in st.session_state.discovered_files]
                selected_indices = st.multiselect(
                    "Select files to download",
                    range(len(file_options)),
                    format_func=lambda x: file_options[x]
                )

                if selected_indices:
                    download_dir = st.text_input("Download Directory", value="./downloads")
                    delete_after = st.checkbox("Delete after creating ZIP?")
                    upload_drive = st.checkbox("Upload to Google Drive?")

                    if st.button("Download Selected"):
                        selected_files = [st.session_state.discovered_files[i] for i in selected_indices]

                        async def download_files():
                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
                                paths = []
                                for file_info in selected_files:
                                    with st.spinner(f"Downloading {file_info['filename']}..."):
                                        path = await dm.download_file(
                                            file_info,
                                            download_dir,
                                            st.session_state.current_url
                                        )
                                        if path:
                                            paths.append(path)
                                return paths

                        downloaded_paths = asyncio.run(download_files())
                        if downloaded_paths:
                            st.success(f"Successfully downloaded {len(downloaded_paths)} files!")

                            if len(downloaded_paths) > 1 or delete_after or upload_drive:
                                with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as tmp:
                                    with zipfile.ZipFile(tmp.name, 'w') as zf:
                                        for p in downloaded_paths:
                                            zf.write(p, arcname=os.path.basename(p))

                                    if upload_drive and st.session_state.google_creds:
                                        file_id = google_drive_upload(tmp.name, st.session_state.google_creds)
                                        if file_id:
                                            st.success(f"Uploaded to Google Drive! File ID: {file_id}")
                                        else:
                                            st.error("Failed to upload to Google Drive")

                            if delete_after:
                                for p in downloaded_paths:
                                    try:
                                        os.remove(p)
                                    except OSError:
                                        pass

    elif mode == "Bing Search":
        st.header("Bing Search Mode")
        query = st.text_input("Enter search query")
        num_results = st.slider("Number of results", 1, 50, 5)

        if st.button("Search"):
            if query:
                async def run_search():
                    async with DownloadManager(
                        use_proxy=use_proxy,
                        proxy=proxy,
                        query=query,
                        num_results=num_results
                    ) as dm:
                        with st.spinner("Searching..."):
                            return await dm.search_bing()

                urls, info_list = asyncio.run(run_search())
                if urls:
                    st.success(f"Found {len(urls)} results!")
                    for i, (url, info) in enumerate(zip(urls, info_list), 1):
                        with st.expander(f"Result {i}: {url}", expanded=i == 1):
                            st.write(f"Snippet: {info['snippet']}")
                            if info['entities']:
                                st.write("Entities:", ', '.join(f"{e[0]} ({e[1]})" for e in info['entities']))

                            # Note: this button sits inside the "Search" button's branch, so it
                            # may not trigger reliably across Streamlit reruns.
                            if st.button(f"Deep Search This Result {i}"):
                                st.session_state.current_url = url

                                async def search_result():
                                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
                                        return await dm.deep_search(
                                            url=url,
                                            custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
                                            max_concurrency=max_concurrency
                                        )

                                files = asyncio.run(search_result())
                                if files:
                                    st.session_state.discovered_files = files
                                    st.success(f"Found {len(files)} files!")
                                else:
                                    st.warning("No files found.")
                else:
                    st.warning("No results found.")

    else:
        st.header("PDF Summarizer")
        pdf_url = st.text_input("Enter PDF URL")

        if st.button("Summarize"):
            if pdf_url:
                summary = summarize_pdf_url(pdf_url)
                st.write("Summary:")
                st.write(summary)


if __name__ == "__main__":
    main()