# GetDataSet / app.py
# rapacious's picture
# Update app.py
# be835ae verified
import requests
from bs4 import BeautifulSoup
import os
import urllib.request
import json
import time
import random
import gradio as gr
import shutil
# Helper functions (kept unchanged)
def get_google_suggestions(query):
    """Return Google autocomplete suggestions for ``query``.

    Sends a GET request to the ``suggestqueries.google.com`` completion
    endpoint (``client=firefox``) and returns the second element of the
    JSON reply, which callers in this file treat as the list of suggested
    search strings.

    Args:
        query: Search text to request suggestions for.

    Returns:
        The list of suggestions, or an empty list when the request fails,
        times out, or the reply is not in the expected shape.
    """
    url = "http://suggestqueries.google.com/complete/search"
    # Let requests build the query string so that spaces, '&', '#', '+' and
    # non-ASCII characters in the query are percent-encoded correctly.
    params = {"client": "firefox", "q": query}
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    try:
        response = requests.get(url, params=params, headers=headers, timeout=5)
        suggestions = json.loads(response.text)[1]
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Best effort: network errors, invalid JSON, or an unexpected payload
        # shape all mean "no suggestions" rather than a crash.
        return []
    # Guard against a payload whose second element is not a list.
    return suggestions if isinstance(suggestions, list) else []
def download_images(search_term, num_images_per_term, save_folder, status_callback):
    """Download up to ``num_images_per_term`` images for ``search_term``.

    Fetches the Google Images result page for ``"<search_term> free"``,
    collects the ``src`` of every ``<img>`` tag that starts with ``http``
    and saves each one as ``<term>_<n>.jpg`` inside ``save_folder``.

    Args:
        search_term: Text to search for.
        num_images_per_term: Maximum number of images to save.
        save_folder: Destination directory; created if it does not exist.
        status_callback: Callable taking one string; receives progress and
            error messages.

    Returns:
        The number of images actually saved (0 when the search page could
        not be fetched).
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_folder, exist_ok=True)

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    try:
        # Passing params lets requests percent-encode the search term instead
        # of splicing raw text into the URL.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": f"{search_term} free", "tbm": "isch"},
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException:
        status_callback(f"Lỗi truy cập {search_term}")
        return 0

    soup = BeautifulSoup(response.text, 'html.parser')

    # Suggestions are arbitrary text: keep letters, digits, '-' and '_' and
    # map everything else (spaces, path separators, ...) to '_' so the term
    # cannot escape save_folder or produce an invalid file name.
    safe_term = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in search_term
    )

    count = 0
    for i, img in enumerate(soup.find_all('img')):
        if count >= num_images_per_term:
            break
        img_url = img.get('src')
        if not (img_url and img_url.startswith('http')):
            continue
        file_name = os.path.join(save_folder, f"{safe_term}_{count}.jpg")
        try:
            # A timeout keeps one stalled image server from hanging the
            # whole run; the body is fully received before the file is
            # opened, so a failed transfer leaves no partial file behind.
            img_response = requests.get(img_url, headers=headers, timeout=10)
            img_response.raise_for_status()
            with open(file_name, "wb") as fh:
                fh.write(img_response.content)
        except Exception as e:
            # Per-image best effort: report the failure and move on.
            status_callback(f"Lỗi ảnh {i} ({search_term}): {str(e)}")
            continue
        status_callback(f"Đã tải: {file_name}")
        count += 1
        # Random pause between downloads to avoid hammering the server.
        time.sleep(random.uniform(1, 3))
    return count
def zip_folder(folder_path):
    """Zip ``folder_path`` into ``downloaded_images.zip`` next to it.

    Args:
        folder_path: Directory whose contents are archived.

    Returns:
        A ``(zip_path, message)`` tuple. ``zip_path`` is the path of the
        created archive, or ``None`` if archiving raised an exception;
        ``message`` is the success or error text to show the user.
    """
    # Normalise first: os.path.dirname("dir/") is "dir", which would place
    # the archive inside the very folder that is being archived.
    parent_dir = os.path.dirname(os.path.normpath(folder_path))
    output_zip = os.path.join(parent_dir, "downloaded_images")
    try:
        shutil.make_archive(output_zip, 'zip', folder_path)
        return output_zip + ".zip", "Đã nén thành công"
    except Exception as e:
        return None, f"Lỗi khi nén: {str(e)}"
def start_download(initial_query, target_images, max_per_term, save_folder, zip_files):
    """Download images by hopping between Google suggestions.

    Each round asks ``get_google_suggestions`` for suggestions of the
    current query, picks at random one that has not been used yet, and
    hands it to ``download_images``. The picked suggestion becomes the
    query for the next round. The loop stops once ``target_images`` images
    have been downloaded or no unused suggestion is left.

    Args:
        initial_query: Query used to obtain the first batch of suggestions.
        target_images: Total number of images to aim for.
        max_per_term: Upper bound of images requested per suggestion.
        save_folder: Directory passed to ``download_images``/``zip_folder``.
        zip_files: When truthy, ``save_folder`` is zipped at the end.

    Returns:
        A ``(log, zip_path)`` tuple: all status messages joined by newlines,
        and the archive path returned by ``zip_folder`` (``None`` when
        zipping was not requested or did not succeed).
    """
    messages = []

    def log(text):
        messages.append(text)

    done = 0
    query = initial_query
    seen = set()

    while done < target_images:
        suggestions = get_google_suggestions(query)
        fresh = [s for s in suggestions if s not in seen] if suggestions else []
        if not fresh:
            # Distinguish "nothing came back" from "everything already used".
            log("Hết gợi ý mới, dừng lại." if suggestions else "Hết gợi ý, dừng lại.")
            break

        query = random.choice(fresh)
        seen.add(query)

        # Never request more than is still missing from the overall target.
        quota = min(max_per_term, target_images - done)
        log(f"Tìm kiếm: {query}")
        done += download_images(query, quota, save_folder, log)
        log(f"Tổng: {done}/{target_images}")
        # Random pause between search rounds.
        time.sleep(random.uniform(2, 5))

    log(f"Hoàn tất! Đã tải {done} ảnh.")

    archive = None
    if zip_files:
        log("Đang nén thư mục...")
        archive, note = zip_folder(save_folder)
        log(note)
    return "\n".join(messages), archive
# Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the image downloader.

    The layout has a settings column (query, image counts, save folder,
    zip option, start button) and a status column (log textbox and a
    hidden file output for the ZIP). Clicking the button runs
    ``start_download`` with the current settings.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    # The stylesheet is passed verbatim to gr.Blocks.
    css = """
body {
background: #2b2b2b;
font-family: 'Segoe UI', sans-serif;
}
.container {
max-width: 1000px;
margin: 20px auto;
background: #36393f;
padding: 25px;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.3);
}
h1 {
color: #7289da;
text-align: center;
margin-bottom: 15px;
}
.description {
color: #b9bbbe;
text-align: center;
font-size: 14px;
margin-bottom: 20px;
}
.input-group {
background: #40444b;
padding: 15px;
border-radius: 8px;
margin-bottom: 15px;
}
.status-box {
background: #2f3136;
border: 1px solid #202225;
border-radius: 8px;
padding: 15px;
height: 350px;
color: #dcddde;
font-family: 'Courier New', monospace;
font-size: 14px;
}
.button-primary {
background: #7289da !important;
color: white !important;
border: none !important;
border-radius: 5px !important;
padding: 10px 20px !important;
font-weight: bold !important;
transition: all 0.3s !important;
}
.button-primary:hover {
background: #677bc4 !important;
}
.button-secondary {
background: #4f545c !important;
color: #dcddde !important;
border-radius: 5px !important;
padding: 8px 15px !important;
}
.button-secondary:hover {
background: #5c6169 !important;
}
.footer {
text-align: center;
color: #72767d;
font-size: 12px;
margin-top: 20px;
}
"""
    with gr.Blocks(css=css, title="Image Downloader Pro") as demo:
        gr.Markdown("<h1>📷 Image Downloader Pro</h1>")
        gr.Markdown("<p class='description'>Tải ảnh từ Google Images với giao diện tối giản và thân thiện</p>")
        with gr.Row(elem_classes="container"):
            # Left column: download settings.
            with gr.Column(scale=1, min_width=300):
                with gr.Group(elem_classes="input-group"):
                    gr.Markdown("#### Cài đặt tải")
                    initial_query = gr.Textbox(
                        label="Từ khóa ban đầu",
                        value="free images",
                        placeholder="Nhập từ khóa tìm kiếm..."
                    )
                    target_images = gr.Slider(
                        label="Số lượng ảnh",
                        minimum=1,
                        maximum=10000,
                        value=100,
                        step=1
                    )
                    max_per_term = gr.Slider(
                        label="Ảnh tối đa mỗi từ khóa",
                        minimum=1,
                        maximum=50,
                        value=20,
                        step=1
                    )
                    save_folder = gr.Textbox(
                        label="Thư mục lưu",
                        value=os.path.join(os.getcwd(), "free_images"),
                        placeholder="Đường dẫn thư mục..."
                    )
                    zip_files = gr.Checkbox(label="Nén thành ZIP sau khi tải", value=True)
                    submit_btn = gr.Button("Bắt đầu tải", elem_classes="button-primary")
            # Right column: status log and the resulting ZIP file.
            with gr.Column(scale=2):
                with gr.Group(elem_classes="input-group"):
                    gr.Markdown("#### Trạng thái")
                    output_status = gr.Textbox(
                        label="Nhật ký tải",
                        lines=15,
                        interactive=False,
                        elem_classes="status-box"
                    )
                    # Hidden until a ZIP archive is actually produced.
                    output_file = gr.File(label="File ZIP (nếu có)", visible=False)
        gr.Markdown("<p class='footer'>Powered by Gradio & xAI</p>")

        # Download handler
        def run_download(query, target, max_term, folder, zip_opt):
            """Run the download and map its result onto the two outputs."""
            # gr.update() is used instead of gr.File.update(): the
            # per-component update() classmethod was removed in Gradio 4,
            # while gr.update() is available in both 3.x and 4.x.
            if not folder:
                return "Vui lòng nhập thư mục lưu!", gr.update(visible=False)
            # Sliders may deliver floats; the download code expects ints.
            status, zip_path = start_download(query, int(target), int(max_term), folder, zip_opt)
            if zip_path:
                return status, gr.update(value=zip_path, visible=True)
            return status, gr.update(visible=False)

        submit_btn.click(
            fn=run_download,
            inputs=[initial_query, target_images, max_per_term, save_folder, zip_files],
            outputs=[output_status, output_file]
        )
    return demo
# Script entry point: build the Gradio app and start its server only when the
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()