import os
import subprocess
import asyncio
import logging
import re
import random
import zipfile
import tempfile
import mimetypes
from io import BytesIO
from pathlib import Path
from urllib.parse import urlparse

import requests
import streamlit as st
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

import spacy
import spacy.cli
from spacy.language import Language

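# Register a no-op factory under the curated-transformers Roberta component name so that
# pipelines referencing it can still be loaded when that package is unavailable; the
# component simply passes the Doc through unchanged.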
@Language.factory("spacy-curated-transformers_RobertaTransformer_v1")
def dummy_roberta_transformer(nlp, name):
    def dummy(doc):
        return doc
    return dummy

@st.cache_resource
def load_nlp_model():
    """Load the spaCy transformer model, downloading it on first use and
    falling back to en_core_web_sm if the transformer model cannot be loaded."""
    try:
        nlp_model = spacy.load("en_core_web_trf")
    except OSError:
        st.write("Model en_core_web_trf not found. Downloading it now...")
        spacy.cli.download("en_core_web_trf")
        try:
            nlp_model = spacy.load("en_core_web_trf")
        except Exception as e:
            st.error(f"Error loading model after download: {e}")
            st.write("Falling back to en_core_web_sm...")
            spacy.cli.download("en_core_web_sm")
            nlp_model = spacy.load("en_core_web_sm")
    return nlp_model

nlp_model = load_nlp_model()

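# Sentence-transformers model used to rank Bing result snippets by semantic similarity to the query.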
from sentence_transformers import SentenceTransformer, util

@st.cache_resource
def load_semantic_model():
    return SentenceTransformer('all-MiniLM-L6-v2')

semantic_model = load_semantic_model()

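# Hugging Face summarization pipeline used by the PDF Summarizer mode.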
from transformers import pipeline

@st.cache_resource
def load_summarizer():
    return pipeline("summarization")

summarizer = load_summarizer()

def summarize_pdf_url(pdf_url):
    """
    Downloads a PDF from the given URL, extracts text using PyPDF2,
    and returns a summary of (up to) the first 3000 characters.
    """
    try:
        with st.spinner("Downloading and processing PDF..."):
            response = requests.get(pdf_url, stream=True)
            temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
            with open(temp_pdf.name, "wb") as f:
                f.write(response.content)
            reader = PdfReader(temp_pdf.name)
            text = " ".join([page.extract_text() or "" for page in reader.pages])
            os.remove(temp_pdf.name)
            limited_text = text[:3000]
            summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
            return summary[0]["summary_text"]
    except Exception as e:
        return f"Error summarizing PDF: {e}"

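# OAuth 2.0 client configuration for the optional Google Drive sign-in flow.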
GOOGLE_OAUTH_CONFIG = {
    "web": {
        "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
        "project_id": "huggingface-449214",
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token",
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "client_secret": "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",
        "redirect_uris": ["https://euler314-craw-web.hf.space/"]
    }
}

import google_auth_oauthlib.flow
import googleapiclient.discovery
import google.auth.transport.requests

def get_google_auth_url():
    client_config = GOOGLE_OAUTH_CONFIG["web"]
    flow = google_auth_oauthlib.flow.Flow.from_client_config(
        {"web": client_config},
        scopes=["https://www.googleapis.com/auth/drive.file"]
    )
    flow.redirect_uri = client_config["redirect_uris"][0]
    authorization_url, _ = flow.authorization_url(
        access_type="offline",
        include_granted_scopes="true",
        prompt="consent"
    )
    return authorization_url

def exchange_code_for_credentials(auth_code):
    if not auth_code.strip():
        return None, "No code provided."
    try:
        client_config = GOOGLE_OAUTH_CONFIG["web"]
        flow = google_auth_oauthlib.flow.Flow.from_client_config(
            {"web": client_config},
            scopes=["https://www.googleapis.com/auth/drive.file"]
        )
        flow.redirect_uri = client_config["redirect_uris"][0]
        flow.fetch_token(code=auth_code.strip())
        creds = flow.credentials
        if not creds or not creds.valid:
            return None, "Could not validate credentials. Check code and try again."
        return creds, "Google Sign-In successful!"
    except Exception as e:
        return None, f"Error during token exchange: {e}"

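# Best-effort installation of the system libraries and Chromium build that Playwright needs at runtime.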
def install_playwright_dependencies():
    os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
    os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
    try:
        subprocess.run(['apt-get', 'update', '-y'], check=True)
        packages = [
            'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
            'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
            'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
        ]
        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
        os.makedirs('/usr/lib/playwright', exist_ok=True)
        symlinks = {
            'libnss3.so': '/usr/lib/x86_64-linux-gnu/libnss3.so',
            'libnssutil3.so': '/usr/lib/x86_64-linux-gnu/libnssutil3.so',
            'libsmime3.so': '/usr/lib/x86_64-linux-gnu/libsmime3.so',
            'libnspr4.so': '/usr/lib/x86_64-linux-gnu/libnspr4.so',
            'libatk-1.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-1.0.so.0',
            'libatk-bridge-2.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0',
            'libcups.so.2': '/usr/lib/x86_64-linux-gnu/libcups.so.2',
            'libatspi.so.0': '/usr/lib/x86_64-linux-gnu/libatspi.so.0',
            'libXcomposite.so.1': '/usr/lib/x86_64-linux-gnu/libXcomposite.so.1',
            'libXdamage.so.1': '/usr/lib/x86_64-linux-gnu/libXdamage.so.1'
        }
        for link_name, target in symlinks.items():
            link_path = os.path.join('/usr/lib/playwright', link_name)
            if not os.path.exists(link_path):
                os.symlink(target, link_path)
        subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
        browser_path = os.path.expanduser("~/.cache/ms-playwright")
        os.makedirs(browser_path, exist_ok=True)
        subprocess.run(['chmod', '-R', '755', browser_path], check=True)
    except subprocess.CalledProcessError as e:
        st.error(f"Error installing dependencies: {e}")
    except Exception as e:
        st.error(f"Error: {e}")

install_playwright_dependencies()

logging.basicConfig(
    filename='advanced_download_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger()

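# Pool of desktop user-agent strings; a random one is assigned to each browser context.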
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
]

def get_random_user_agent():
    return random.choice(USER_AGENTS)

def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable size string."""
    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f}Y{suffix}"

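# Helpers that mimic human scrolling and mouse activity while browsing.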
async def human_like_scroll(page):
    scroll_height = await page.evaluate('document.body.scrollHeight')
    viewport_height = await page.evaluate('window.innerHeight')
    current_scroll = 0
    while current_scroll < scroll_height:
        await page.evaluate(f'window.scrollTo(0, {current_scroll})')
        await asyncio.sleep(random.uniform(0.5, 1.5))
        current_scroll += viewport_height * random.uniform(0.5, 1.5)
        scroll_height = await page.evaluate('document.body.scrollHeight')

async def human_like_interactions(page):
    await page.mouse.move(random.randint(0, 1000), random.randint(0, 1000))
    await asyncio.sleep(random.uniform(0.5, 1.5))
    await page.mouse.click(random.randint(0, 1000), random.randint(0, 1000))
    await asyncio.sleep(random.uniform(0.5, 1.5))
    await page.evaluate("window.scrollBy(0, window.innerHeight / 2)")
    await asyncio.sleep(random.uniform(0.5, 1.5))

def nlp_preprocess(query: str) -> str:
    """Lemmatize the query and drop stop words; fall back to the raw query if nothing remains."""
    doc = nlp_model(query)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
    processed = " ".join(tokens)
    return processed if processed.strip() else query

def nlp_extract_entities(text: str):
    doc = nlp_model(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

def ai_preprocess_query(query: str) -> str:
    # Placeholder hook: currently returns the query unchanged.
    return query

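# DownloadManager wraps a headless Chromium session (via Playwright) and provides
# Bing search, downloadable-file discovery, and download helpers.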
class DownloadManager:
    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
        self.use_proxy = use_proxy
        self.proxy = proxy
        self.query = query
        self.num_results = num_results
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        opts = {"headless": True}
        if self.use_proxy and self.proxy:
            opts["proxy"] = {"server": self.proxy}
        self.browser = await self.playwright.chromium.launch(**opts)
        self.context = await self.browser.new_context(user_agent=get_random_user_agent())
        self.page = await self.context.new_page()
        await self.page.set_extra_http_headers({
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://www.bing.com/'
        })
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def get_file_size(self, url):
        try:
            response = await self.page.request.head(url)
            # Playwright lower-cases header names, so check both spellings defensively.
            length = response.headers.get('content-length') or response.headers.get('Content-Length')
            if length:
                return sizeof_fmt(int(length))
            else:
                return "Unknown Size"
        except Exception:
            return "Unknown Size"

    async def get_pdf_metadata(self, url):
        try:
            resp = await self.page.request.get(url, timeout=15000)
            if resp.ok:
                content = await resp.body()
                pdf = BytesIO(content)
                reader = PdfReader(pdf)
                return {
                    'Title': reader.metadata.title if reader.metadata.title else 'N/A',
                    'Author': reader.metadata.author if reader.metadata.author else 'N/A',
                    'Pages': len(reader.pages),
                }
            else:
                return {}
        except Exception:
            return {}

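    # Search Bing for the query (biased toward PDFs), then re-rank results by
    # semantic similarity between the query and each result snippet.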
    async def search_bing(self):
        if not self.query:
            return [], []
        query = self.query
        if "filetype:pdf" not in query.lower():
            query += " filetype:pdf"
        if "site:" not in query.lower():
            query += " site:edu OR site:arxiv.org OR site:openstax.org"
        query = ai_preprocess_query(query)
        query_processed = nlp_preprocess(query)
        logger.info(f"BING SEARCH NLP: Original='{query}' -> Processed='{query_processed}'")

        bing_url = f"https://www.bing.com/search?q={query_processed.replace(' ', '+')}&count={self.num_results}"
        try:
            await self.page.goto(bing_url, timeout=30000)
            await self.page.wait_for_selector('li.b_algo', timeout=30000)
            await human_like_scroll(self.page)
            html = await self.page.content()
            soup = BeautifulSoup(html, 'html.parser')
            raw_results = soup.find_all('li', class_='b_algo')
            url_list = []
            info_list = []
            snippets = []

            for r in raw_results:
                link_tag = r.find('a')
                snippet_tag = r.find('p')
                snippet_text = snippet_tag.get_text(strip=True) if snippet_tag else ""
                entities = nlp_extract_entities(snippet_text)

                if link_tag and 'href' in link_tag.attrs:
                    link_url = link_tag['href']
                    url_list.append(link_url)
                    # Keep snippets aligned with url_list so the ranking below indexes correctly.
                    snippets.append(snippet_text)
                    info_list.append({
                        'url': link_url,
                        'snippet': snippet_text,
                        'entities': entities
                    })
                if len(url_list) >= self.num_results:
                    break

            if not url_list:
                return [], []

            # Re-rank results by cosine similarity between the query and each snippet.
            query_emb = semantic_model.encode(query, convert_to_tensor=True)
            snippet_embs = semantic_model.encode(snippets, convert_to_tensor=True)
            scores = util.cos_sim(query_emb, snippet_embs)[0]
            sorted_indices = scores.argsort(descending=True).cpu().numpy().tolist()
            sorted_url_list = [url_list[i] for i in sorted_indices]
            sorted_info_list = [info_list[i] for i in sorted_indices]

            return sorted_url_list, sorted_info_list
        except PlaywrightTimeoutError:
            logger.error("Bing search timed out.")
            return [], []
        except Exception as e:
            logger.error(f"Bing search error: {e}")
            return [], []

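    # Scan a page for links whose extensions match the requested list, plus Google Drive share links.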
    async def extract_downloadable_files(self, url, custom_ext_list):
        found_files = []
        try:
            await self.page.goto(url, timeout=30000)
            await self.page.wait_for_load_state('networkidle', timeout=30000)
            await human_like_interactions(self.page)
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

            default_exts = [
                '.pdf', '.docx', '.zip', '.rar', '.exe', '.mp3',
                '.mp4', '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif'
            ]
            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])

            anchors = soup.find_all('a', href=True)
            for a in anchors:
                href = a['href'].strip()
                if any(href.lower().endswith(ext) for ext in all_exts):
                    if href.startswith('http'):
                        file_url = href
                    elif href.startswith('/'):
                        parsed = urlparse(url)
                        file_url = f"{parsed.scheme}://{parsed.netloc}{href}"
                    else:
                        continue

                    size_str = await self.get_file_size(file_url)
                    meta = {}
                    if file_url.lower().endswith('.pdf'):
                        meta = await self.get_pdf_metadata(file_url)

                    found_files.append({
                        'url': file_url,
                        'filename': os.path.basename(file_url.split('?')[0]),
                        'size': size_str,
                        'metadata': meta
                    })

                elif ("drive.google.com" in href) or ("drive.com" in href):
                    file_id = None
                    for pattern in [
                        r'/file/d/([^/]+)/',
                        r'open\?id=([^&]+)',
                        r'id=([^&]+)'
                    ]:
                        match = re.search(pattern, href)
                        if match:
                            file_id = match.group(1)
                            break

                    if file_id:
                        direct = f"https://drive.google.com/uc?export=download&id={file_id}"
                        filename = f"drive_file_{file_id}"
                        try:
                            resp = await self.page.request.head(direct, timeout=15000)
                            # Playwright lower-cases header names, so check both spellings defensively.
                            cd = resp.headers.get("content-disposition") or resp.headers.get("Content-Disposition", "")
                            if cd:
                                mt = re.search(r'filename\*?="?([^";]+)', cd)
                                if mt:
                                    filename = mt.group(1).strip('"').strip()
                            else:
                                ctype = resp.headers.get("content-type") or resp.headers.get("Content-Type", "")
                                ext_guess = mimetypes.guess_extension(ctype) or ""
                                filename = f"drive_file_{file_id}{ext_guess}"
                        except Exception:
                            pass

                        size_str = await self.get_file_size(direct)
                        found_files.append({
                            'url': direct,
                            'filename': filename,
                            'size': size_str,
                            'metadata': {}
                        })

            return found_files
        except PlaywrightTimeoutError:
            logger.error(f"Timeout extracting from {url}")
            return []
        except Exception as e:
            logger.error(f"Error extracting from {url}: {e}")
            return []

    async def download_file(self, file_info, save_dir, referer):
        file_url = file_info['url']
        fname = file_info['filename']
        path = os.path.join(save_dir, fname)
        base, ext = os.path.splitext(fname)
        i = 1
        while os.path.exists(path):
            path = os.path.join(save_dir, f"{base}({i}){ext}")
            i += 1

        os.makedirs(save_dir, exist_ok=True)
        try:
            if file_url.lower().endswith(".pdf") and "drive.google.com" not in file_url.lower():
                response = requests.get(file_url, stream=True)
                with open(path, "wb") as f:
                    f.write(response.content)
                logger.info(f"Directly downloaded PDF: {path}")
                return path

            if "drive.google.com" in file_url.lower():
                import gdown  # lazy import: only needed for Google Drive downloads
                try:
                    result = gdown.download(file_url, output=path, quiet=False, fuzzy=True)
                    if result is None:
                        logger.error(f"gdown failed to download: {file_url}")
                        return None
                    current_ext = os.path.splitext(path)[1].lower()
                    allowed_exts = {'.pdf', '.jpg', '.jpeg', '.png', '.docx', '.zip', '.rar', '.mp3', '.mp4', '.avi', '.mkv'}
                    if current_ext not in allowed_exts:
                        try:
                            r = requests.head(file_url, allow_redirects=True, timeout=15)
                            ctype = r.headers.get("Content-Type", "")
                            guessed_ext = mimetypes.guess_extension(ctype) or ".pdf"
                        except Exception as e:
                            logger.error(f"Error in HEAD request for extension: {e}")
                            guessed_ext = ".pdf"
                        new_path = os.path.splitext(path)[0] + guessed_ext
                        os.rename(path, new_path)
                        path = new_path
                    logger.info(f"Downloaded using gdown: {path}")
                    return path
                except Exception as e:
                    logger.error(f"Error downloading using gdown: {e}")
                    return None

            headers = {
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'Referer': referer
            }
            await human_like_interactions(self.page)
            resp = await self.page.request.get(file_url, headers=headers, timeout=30000)
            if resp.status == 403:
                logger.error(f"403 Forbidden: {file_url}")
                return None
            if not resp.ok:
                logger.error(f"Failed to download {file_url}: Status {resp.status}")
                return None
            data = await resp.body()
            with open(path, 'wb') as f:
                f.write(data)
            logger.info(f"Downloaded: {path}")
            return path
        except PlaywrightTimeoutError:
            logger.error(f"Timeout downloading {file_url}")
            return None
        except Exception as e:
            logger.error(f"Error downloading {file_url}: {e}")
            return None

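    # Crawl the main page and its sublinks concurrently, then deduplicate the discovered files by URL.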
    async def deep_search(self, url, custom_ext_list, sublink_limit=2000, max_concurrency=500):
        progress_text = st.empty()
        progress_bar = st.progress(0)

        progress_text.text("Analyzing main page...")
        all_files = []
        main_files = await self.extract_downloadable_files(url, custom_ext_list)
        all_files.extend(main_files)

        progress_text.text("Getting sublinks...")
        sublinks = await self.get_sublinks(url, sublink_limit)
        total_links = len(sublinks)

        progress_text.text(f"Processing {total_links} sublinks...")
        sem = asyncio.Semaphore(max_concurrency)

        async def analyze_one_sublink(link, idx):
            async with sem:
                progress_text.text(f"Processing link {idx}/{total_links}: {link}")
                progress_bar.progress(idx / total_links)
                return await self.extract_downloadable_files(link, custom_ext_list)

        tasks = [analyze_one_sublink(link, i) for i, link in enumerate(sublinks, 1)]
        sub_results = await asyncio.gather(*tasks)

        for sr in sub_results:
            all_files.extend(sr)

        # Deduplicate by URL, keeping one entry per file.
        unique_map = {f['url']: f for f in all_files}
        combined = list(unique_map.values())

        progress_text.text(f"Found {len(combined)} unique files.")
        progress_bar.progress(1.0)
        return combined

    async def get_sublinks(self, url, limit=20000):
        try:
            await self.page.goto(url, timeout=30000)
            content = await self.page.content()
            soup = BeautifulSoup(content, "html.parser")
            links = []
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if href.startswith('http'):
                    links.append(href)
                elif href.startswith('/'):
                    parsed = urlparse(url)
                    links.append(f"{parsed.scheme}://{parsed.netloc}{href}")
            return list(set(links))[:limit]
        except Exception as e:
            logger.error(f"Error getting sublinks: {e}")
            return []

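# Streamlit UI: sidebar mode selection, advanced options, Google Drive sign-in, and the three app modes.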
def main():
    st.set_page_config(page_title="Advanced File Downloader", layout="wide")

    if 'session_state' not in st.session_state:
        st.session_state.session_state = {
            'discovered_files': [],
            'current_url': None,
            'download_manager': None,
            'google_creds': None
        }

    st.title("Advanced File Downloader")

    mode = st.sidebar.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])

    with st.sidebar.expander("Advanced Options"):
        # Widget keys make these options available via st.session_state in the mode functions below.
        custom_extensions = st.text_input(
            "Custom File Extensions",
            placeholder=".csv, .txt, .epub",
            key='custom_extensions'
        )
        max_concurrency = st.slider(
            "Max Concurrency",
            min_value=1,
            max_value=1000,
            value=200,
            key='max_concurrency'
        )
        use_proxy = st.checkbox("Use Proxy", key='use_proxy')
        proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key='proxy')

    with st.expander("Google Drive Integration"):
        if st.button("Start Google Sign-In"):
            auth_url = get_google_auth_url()
            st.markdown(f"[Click here to authorize]({auth_url})")

        auth_code = st.text_input("Enter authorization code")
        if st.button("Complete Sign-In") and auth_code:
            creds, msg = exchange_code_for_credentials(auth_code)
            st.session_state.session_state['google_creds'] = creds
            st.write(msg)

    if mode == "Manual URL":
        manual_url_mode()
    elif mode == "Bing Search":
        bing_search_mode()
    else:
        pdf_summarizer_mode()

def manual_url_mode():
    st.header("Manual URL Mode")

    url = st.text_input("Enter URL", placeholder="https://example.com")

    if st.button("Deep Search"):
        if url:
            async def run_deep_search():
                async with DownloadManager(
                    use_proxy=st.session_state.get('use_proxy', False),
                    proxy=st.session_state.get('proxy', None)
                ) as dm:
                    files = await dm.deep_search(
                        url=url,
                        custom_ext_list=st.session_state.get('custom_extensions', '').split(','),
                        max_concurrency=st.session_state.get('max_concurrency', 200)
                    )
                    st.session_state.session_state['discovered_files'] = files
                    st.session_state.session_state['current_url'] = url

                    if files:
                        st.write(f"Found {len(files)} files:")
                        for f in files:
                            st.write(f"- {f['filename']} ({f['size']})")
                    else:
                        st.warning("No files found.")

            asyncio.run(run_deep_search())

def bing_search_mode():
    st.header("Bing Search Mode")

    query = st.text_input("Enter search query")
    num_results = st.slider("Number of results", 1, 50, 5)

    if st.button("Search"):
        if query:
            async def run_search():
                async with DownloadManager(
                    use_proxy=st.session_state.get('use_proxy', False),
                    proxy=st.session_state.get('proxy', None),
                    query=query,
                    num_results=num_results
                ) as dm:
                    urls, info_list = await dm.search_bing()
                    if urls:
                        st.write("Search Results:")
                        for i, (url, info) in enumerate(zip(urls, info_list), 1):
                            st.write(f"{i}. {url}")
                            st.write(f"   Snippet: {info['snippet']}")
                    else:
                        st.warning("No results found.")

            asyncio.run(run_search())

def pdf_summarizer_mode():
    st.header("PDF Summarizer")

    pdf_url = st.text_input("Enter PDF URL")

    if st.button("Summarize"):
        if pdf_url:
            summary = summarize_pdf_url(pdf_url)
            st.write("Summary:")
            st.write(summary)

if __name__ == "__main__":
    main()