Update app.py
Browse files
app.py
CHANGED
@@ -25,6 +25,10 @@ import shutil
|
|
25 |
from PIL import Image
|
26 |
from reportlab.lib.pagesizes import letter
|
27 |
from reportlab.pdfgen import canvas
|
|
|
|
|
|
|
|
|
28 |
|
29 |
# -------------------- Logging Setup --------------------
|
30 |
logging.basicConfig(
|
@@ -33,6 +37,19 @@ logging.basicConfig(
|
|
33 |
)
|
34 |
logger = logging.getLogger(__name__)
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
# -------------------- Utility Functions --------------------
|
37 |
def get_random_user_agent():
|
38 |
USER_AGENTS = [
|
@@ -58,6 +75,56 @@ def create_zip_file(file_paths, output_dir):
|
|
58 |
zipf.write(file_path, os.path.basename(file_path))
|
59 |
return zip_path
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
# -------------------- Playwright Setup --------------------
|
62 |
def install_playwright_dependencies():
|
63 |
try:
|
@@ -65,9 +132,17 @@ def install_playwright_dependencies():
|
|
65 |
os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
|
66 |
|
67 |
# Install system dependencies
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
subprocess.run(['pip', 'install', 'playwright'], check=True)
|
69 |
-
subprocess.run(['playwright', 'install', 'chromium'], check=True)
|
70 |
-
subprocess.run(['playwright', 'install-deps', 'chromium'], check=True)
|
71 |
|
72 |
st.success("Playwright dependencies installed successfully!")
|
73 |
except Exception as e:
|
@@ -1664,6 +1739,7 @@ def main():
|
|
1664 |
st.session_state.initialized = True
|
1665 |
st.session_state.discovered_files = []
|
1666 |
st.session_state.current_url = None
|
|
|
1667 |
st.session_state.selected_files = []
|
1668 |
st.session_state.do_deep_search = False
|
1669 |
st.session_state.deep_search_url = None
|
@@ -1678,6 +1754,16 @@ def main():
|
|
1678 |
use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
|
1679 |
proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
|
1680 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1681 |
if mode == "Manual URL":
|
1682 |
st.header("Manual URL Mode")
|
1683 |
url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
|
@@ -1744,8 +1830,15 @@ def main():
|
|
1744 |
st.session_state.selected_files = selected_indices
|
1745 |
|
1746 |
if selected_indices:
|
1747 |
-
|
1748 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1749 |
|
1750 |
if st.button("Download Selected", key="download_btn"):
|
1751 |
if not os.path.exists(download_dir):
|
@@ -1777,8 +1870,7 @@ def main():
|
|
1777 |
if downloaded:
|
1778 |
st.success(f"Successfully downloaded {len(downloaded)} files")
|
1779 |
|
1780 |
-
|
1781 |
-
if create_zip and len(downloaded) > 1:
|
1782 |
zip_path = create_zip_file(downloaded, download_dir)
|
1783 |
st.success(f"Created ZIP file: {zip_path}")
|
1784 |
|
@@ -1786,14 +1878,32 @@ def main():
|
|
1786 |
with open(zip_path, "rb") as f:
|
1787 |
zip_data = f.read()
|
1788 |
|
1789 |
-
zip_filename = os.path.basename(zip_path)
|
1790 |
st.download_button(
|
1791 |
label="Download ZIP",
|
1792 |
data=zip_data,
|
1793 |
-
file_name=
|
1794 |
mime="application/zip",
|
1795 |
key="download_zip_btn"
|
1796 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1797 |
else:
|
1798 |
# Provide individual file downloads
|
1799 |
st.write("Download files individually:")
|
|
|
25 |
from PIL import Image
|
26 |
from reportlab.lib.pagesizes import letter
|
27 |
from reportlab.pdfgen import canvas
|
28 |
+
import google_auth_oauthlib.flow
|
29 |
+
import googleapiclient.discovery
|
30 |
+
import google.auth.transport.requests
|
31 |
+
import googleapiclient.http
|
32 |
|
33 |
# -------------------- Logging Setup --------------------
|
34 |
logging.basicConfig(
|
|
|
37 |
)
|
38 |
logger = logging.getLogger(__name__)
|
39 |
|
40 |
+
# -------------------- Google OAuth Config --------------------
|
41 |
+
# SECURITY: an OAuth client secret must never be committed to source control.
# It is read from the environment instead (for example a deployment secret
# exposed to the process as an environment variable). The secret that used to
# be hard-coded here has been published with the source and should be rotated.
#
# Environment variables:
#   GOOGLE_CLIENT_ID      optional; overrides the default client id below.
#   GOOGLE_CLIENT_SECRET  required for sign-in; an empty value makes the token
#                         exchange fail with an error message instead of
#                         silently using a leaked secret.
#
# The dict keeps the same "web" layout and keys as before, so code that reads
# GOOGLE_OAUTH_CONFIG["web"] continues to work unchanged.
GOOGLE_OAUTH_CONFIG = {
    "web": {
        # The client id is a public identifier, so keeping a default is fine.
        "client_id": os.environ.get(
            "GOOGLE_CLIENT_ID",
            "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
        ),
        "project_id": "huggingface-449214",
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token",
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "client_secret": os.environ.get("GOOGLE_CLIENT_SECRET", ""),
        "redirect_uris": ["https://euler314-craw-web.hf.space/"],
    }
}
|
52 |
+
|
53 |
# -------------------- Utility Functions --------------------
|
54 |
def get_random_user_agent():
|
55 |
USER_AGENTS = [
|
|
|
75 |
zipf.write(file_path, os.path.basename(file_path))
|
76 |
return zip_path
|
77 |
|
78 |
+
# -------------------- Google Drive Functions --------------------
|
79 |
+
def get_google_auth_url():
    """Return the Google OAuth authorization URL for the Drive upload scope.

    A flow is created from the ``"web"`` section of ``GOOGLE_OAUTH_CONFIG``
    with the ``drive.file`` scope, and its redirect URI is set to the first
    entry of the configured ``redirect_uris``.

    Returns:
        str: The first value returned by ``Flow.authorization_url``.
    """
    web_settings = GOOGLE_OAUTH_CONFIG["web"]
    drive_scopes = ["https://www.googleapis.com/auth/drive.file"]

    oauth_flow = google_auth_oauthlib.flow.Flow.from_client_config(
        {"web": web_settings},
        scopes=drive_scopes,
    )
    oauth_flow.redirect_uri = web_settings["redirect_uris"][0]

    url_options = {
        "access_type": "offline",
        "include_granted_scopes": "true",
        "prompt": "consent",
    }
    # NOTE(review): the second returned value (presumably the OAuth state
    # token) is discarded, so it cannot be checked later - confirm this is
    # acceptable for the manual copy/paste code flow used by the UI.
    consent_url, _unused_state = oauth_flow.authorization_url(**url_options)
    return consent_url
|
92 |
+
|
93 |
+
def exchange_code_for_credentials(auth_code):
    """Exchange a user-supplied OAuth authorization code for credentials.

    Args:
        auth_code: The authorization code entered by the user. ``None``,
            non-string values, and empty or whitespace-only strings are
            rejected up front without building a flow.

    Returns:
        tuple: ``(credentials, message)``. ``credentials`` is the flow's
        credentials object on success and ``None`` on any failure;
        ``message`` is a human-readable status string for the UI.
    """
    # Normalise once. Previously ``auth_code.strip()`` ran outside the try
    # block, so a ``None`` value raised AttributeError instead of returning
    # the documented (None, message) pair.
    code = auth_code.strip() if isinstance(auth_code, str) else ""
    if not code:
        return None, "No code provided."

    try:
        client_config = GOOGLE_OAUTH_CONFIG["web"]
        flow = google_auth_oauthlib.flow.Flow.from_client_config(
            {"web": client_config},
            scopes=["https://www.googleapis.com/auth/drive.file"],
        )
        # Uses the same redirect URI as get_google_auth_url (first configured entry).
        flow.redirect_uri = client_config["redirect_uris"][0]
        flow.fetch_token(code=code)
        creds = flow.credentials
        if not creds or not creds.valid:
            return None, "Could not validate credentials. Check code and try again."
        return creds, "Google Sign-In successful!"
    except Exception as e:
        # UI boundary: report the failure as a message rather than crashing
        # the Streamlit script run.
        return None, f"Error during token exchange: {e}"
|
110 |
+
|
111 |
+
def google_drive_upload(file_path, credentials, folder_id=None):
    """Upload a local file to Google Drive and return the new file's ID.

    Args:
        file_path: Path of the file to upload; its base name becomes the
            Drive file name.
        credentials: Credentials object passed to the Drive v3 client builder.
        folder_id: Optional ID of the parent Drive folder. When falsy, no
            ``parents`` entry is sent.

    Returns:
        str: The ``id`` field of the created file (``""`` if the response has
        no ``id``), or a string beginning with ``"Error uploading to Drive: "``
        if any step raised an exception. Callers distinguish the two cases by
        that prefix.
    """
    try:
        service = googleapiclient.discovery.build(
            "drive", "v3", credentials=credentials
        )

        # Only attach the parent folder when one was supplied.
        parent_fields = {'parents': [folder_id]} if folder_id else {}
        metadata = {'name': os.path.basename(file_path), **parent_fields}

        upload_body = googleapiclient.http.MediaFileUpload(file_path, resumable=True)
        create_request = service.files().create(
            body=metadata,
            media_body=upload_body,
            fields='id',
        )
        response = create_request.execute()
        return response.get("id", "")
    except Exception as exc:
        # Failures are reported in-band as a string, not raised.
        return f"Error uploading to Drive: {exc}"
|
122 |
+
|
123 |
+
def create_drive_folder(drive_service, name):
    """Create a folder through the given Drive service and return its ID.

    Args:
        drive_service: Drive client exposing ``files().create(...).execute()``.
        name: Name to give the new folder.

    Returns:
        The ``id`` value from the create response, or ``None`` if the
        response has no ``id`` key.
    """
    create_request = drive_service.files().create(
        body={
            'name': name,
            # This MIME type marks the new item as a folder.
            'mimeType': 'application/vnd.google-apps.folder',
        },
        fields='id',
    )
    return create_request.execute().get('id')
|
127 |
+
|
128 |
# -------------------- Playwright Setup --------------------
|
129 |
def install_playwright_dependencies():
|
130 |
try:
|
|
|
132 |
os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
|
133 |
|
134 |
# Install system dependencies
|
135 |
+
subprocess.run(['apt-get', 'update', '-y'], check=True)
|
136 |
+
packages = [
|
137 |
+
'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
|
138 |
+
'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
|
139 |
+
'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
|
140 |
+
]
|
141 |
+
subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
|
142 |
+
|
143 |
+
# Install Playwright and dependencies
|
144 |
subprocess.run(['pip', 'install', 'playwright'], check=True)
|
145 |
+
subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
|
|
|
146 |
|
147 |
st.success("Playwright dependencies installed successfully!")
|
148 |
except Exception as e:
|
|
|
1739 |
st.session_state.initialized = True
|
1740 |
st.session_state.discovered_files = []
|
1741 |
st.session_state.current_url = None
|
1742 |
+
st.session_state.google_creds = None
|
1743 |
st.session_state.selected_files = []
|
1744 |
st.session_state.do_deep_search = False
|
1745 |
st.session_state.deep_search_url = None
|
|
|
1754 |
use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
|
1755 |
proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
|
1756 |
|
1757 |
+
with st.expander("Google Drive Integration", expanded=False):
|
1758 |
+
if st.button("Start Google Sign-In", key="google_signin_btn"):
|
1759 |
+
auth_url = get_google_auth_url()
|
1760 |
+
st.markdown(f"[Click here to authorize]({auth_url})")
|
1761 |
+
auth_code = st.text_input("Enter authorization code", key="auth_code_input")
|
1762 |
+
if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
|
1763 |
+
creds, msg = exchange_code_for_credentials(auth_code)
|
1764 |
+
st.session_state.google_creds = creds
|
1765 |
+
st.write(msg)
|
1766 |
+
|
1767 |
if mode == "Manual URL":
|
1768 |
st.header("Manual URL Mode")
|
1769 |
url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
|
|
|
1830 |
st.session_state.selected_files = selected_indices
|
1831 |
|
1832 |
if selected_indices:
|
1833 |
+
col1, col2, col3, col4 = st.columns(4)
|
1834 |
+
with col1:
|
1835 |
+
download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
|
1836 |
+
with col2:
|
1837 |
+
create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
|
1838 |
+
with col3:
|
1839 |
+
delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
|
1840 |
+
with col4:
|
1841 |
+
upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
|
1842 |
|
1843 |
if st.button("Download Selected", key="download_btn"):
|
1844 |
if not os.path.exists(download_dir):
|
|
|
1870 |
if downloaded:
|
1871 |
st.success(f"Successfully downloaded {len(downloaded)} files")
|
1872 |
|
1873 |
+
if create_zip:
|
|
|
1874 |
zip_path = create_zip_file(downloaded, download_dir)
|
1875 |
st.success(f"Created ZIP file: {zip_path}")
|
1876 |
|
|
|
1878 |
with open(zip_path, "rb") as f:
|
1879 |
zip_data = f.read()
|
1880 |
|
|
|
1881 |
st.download_button(
|
1882 |
label="Download ZIP",
|
1883 |
data=zip_data,
|
1884 |
+
file_name=os.path.basename(zip_path),
|
1885 |
mime="application/zip",
|
1886 |
key="download_zip_btn"
|
1887 |
)
|
1888 |
+
|
1889 |
+
# Upload to Google Drive if requested
|
1890 |
+
if upload_to_drive and st.session_state.google_creds:
|
1891 |
+
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds)
|
1892 |
+
folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}")
|
1893 |
+
drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id)
|
1894 |
+
if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
|
1895 |
+
st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
|
1896 |
+
else:
|
1897 |
+
st.error(drive_id)
|
1898 |
+
|
1899 |
+
# Delete original files if requested
|
1900 |
+
if delete_after:
|
1901 |
+
for path in downloaded:
|
1902 |
+
try:
|
1903 |
+
os.remove(path)
|
1904 |
+
except Exception as e:
|
1905 |
+
st.warning(f"Could not delete {path}: {e}")
|
1906 |
+
st.info("Deleted original files after ZIP creation")
|
1907 |
else:
|
1908 |
# Provide individual file downloads
|
1909 |
st.write("Download files individually:")
|