# Spaces: rapacious / GetDataSet (Running)
# File size: 5,177 Bytes

import requests
from bs4 import BeautifulSoup
import os
import urllib.request
import json
import time
import random
import gradio as gr
import shutil
import threading

# Fetch search suggestions from Google
def get_google_suggestions(query):
    """Return Google autocomplete suggestions for ``query``.

    Queries the ``suggestqueries.google.com`` completion endpoint with
    ``client=firefox`` and returns the second element of the decoded JSON
    payload.

    Args:
        query: Search text to request suggestions for.

    Returns:
        The list of suggestions, or an empty list when the request fails or
        the response cannot be interpreted.
    """
    url = "https://suggestqueries.google.com/complete/search"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    try:
        # Passing the query through ``params`` lets requests percent-encode
        # it; interpolating it into the URL breaks on '&', '#', '+', etc.
        response = requests.get(
            url,
            params={"client": "firefox", "q": query},
            headers=headers,
            timeout=5,
        )
        response.raise_for_status()
        suggestions = json.loads(response.text)[1]
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Network failure, HTTP error status, invalid JSON (JSONDecodeError is
        # a ValueError) or an unexpected payload shape: best effort, no
        # suggestions. KeyboardInterrupt/SystemExit are deliberately not caught.
        return []
    # Callers iterate the result, so never hand back a non-list payload.
    return suggestions if isinstance(suggestions, list) else []

# Download images
def download_images(search_term, num_images_per_term, save_folder, status_callback):
    """Download up to ``num_images_per_term`` images for one search term.

    Fetches the Google Images result page for ``"<search_term> free"``, takes
    every ``<img>`` tag whose ``src`` starts with ``http`` and saves it as
    ``<save_folder>/<term>_<n>.jpg``. Pauses 1-3 seconds after each saved
    image.

    Args:
        search_term: Text to search for.
        num_images_per_term: Maximum number of images to save.
        save_folder: Directory to write into; created if missing.
        status_callback: Callable taking one message string; called for each
            saved image and each failure.

    Returns:
        The number of images saved (0 when the result page cannot be fetched).
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_folder, exist_ok=True)

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}

    try:
        # ``params`` percent-encodes the term; the space before "free" is the
        # encoded form of the '+' separator used in a hand-built URL.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": f"{search_term} free", "tbm": "isch"},
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException:
        status_callback(f"Lỗi truy cập {search_term}")
        return 0

    soup = BeautifulSoup(response.text, 'html.parser')

    # Suggestions are untrusted text: neutralise path separators and other
    # characters that are invalid in file names so the image always lands
    # directly inside save_folder. Spaces become '_' as before.
    safe_term = "".join(
        "_" if ch in ' /\\:*?"<>|' or not ch.isprintable() else ch
        for ch in search_term
    )

    count = 0
    for i, img in enumerate(soup.find_all('img')):
        if count >= num_images_per_term:
            break
        img_url = img.get('src')
        if not img_url or not img_url.startswith('http'):
            continue
        file_name = f"{save_folder}/{safe_term}_{count}.jpg"
        try:
            # A timeout keeps one stalled image server from hanging the whole
            # run (urlretrieve offers no timeout).
            img_response = requests.get(img_url, headers=headers, timeout=10)
            img_response.raise_for_status()
            # Write only after the download succeeded, so a failed request
            # leaves no partial file behind.
            with open(file_name, "wb") as fh:
                fh.write(img_response.content)
        except (requests.RequestException, OSError, ValueError) as e:
            status_callback(f"Lỗi ảnh {i} ({search_term}): {str(e)}")
            continue
        status_callback(f"Đã tải: {file_name}")
        count += 1
        # Random pause between downloads to avoid hammering the server.
        time.sleep(random.uniform(1, 3))

    return count

# Compress the folder into a zip archive
def zip_folder(folder_path):
    """Archive ``folder_path`` as ``downloaded_images.zip`` next to it.

    The archive is written to the parent directory of ``folder_path`` (the
    current directory when ``folder_path`` has no parent component).

    Args:
        folder_path: Directory whose contents are archived.

    Returns:
        A status message: the archive path on success, or the error text on
        failure. Failures are reported, not raised.
    """
    try:
        if not os.path.isdir(folder_path):
            raise NotADirectoryError(f"not a directory: {folder_path!r}")
        # Strip trailing separators first: dirname("imgs/") is "imgs", which
        # would put the archive inside the very folder being archived.
        normalized = os.path.normpath(folder_path)
        output_zip = os.path.join(os.path.dirname(normalized), "downloaded_images")
        shutil.make_archive(output_zip, 'zip', normalized)
        return f"Đã nén thành công: {output_zip}.zip"
    except Exception as e:
        # Reporting boundary: the caller only logs the returned message.
        return f"Lỗi khi nén: {str(e)}"

# Main image-download routine
def start_download(initial_query, target_images, max_per_term, save_folder):
    """Download images by walking a chain of Google search suggestions.

    ``initial_query`` only seeds the suggestion lookup. Each round picks a
    random, not-yet-used suggestion, downloads images for it, and then uses
    that suggestion as the seed for the next round. The loop ends once
    ``target_images`` images were saved or no fresh suggestion is available;
    afterwards ``save_folder`` is zipped.

    Args:
        initial_query: Seed text for the first suggestion lookup.
        target_images: Total number of images to aim for.
        max_per_term: Upper bound of images requested per suggestion.
        save_folder: Directory the images are written to and that is zipped.

    Returns:
        All status messages joined with newlines.
    """
    messages = []
    log = messages.append  # doubles as the status callback for downloads

    seen_terms = set()
    seed = initial_query
    saved_total = 0

    while saved_total < target_images:
        candidates = get_google_suggestions(seed)
        if not candidates:
            log("Hết gợi ý, dừng lại.")
            break

        fresh = list(filter(lambda term: term not in seen_terms, candidates))
        if not fresh:
            log("Hết gợi ý mới, dừng lại.")
            break

        # The chosen suggestion becomes the seed of the next round.
        seed = random.choice(fresh)
        seen_terms.add(seed)

        # Never request more than what is still missing overall.
        quota = min(max_per_term, target_images - saved_total)

        log(f"Tìm kiếm: {seed}")
        saved_total += download_images(seed, quota, save_folder, log)

        log(f"Tổng: {saved_total}/{target_images}")
        time.sleep(random.uniform(2, 5))

    log(f"Hoàn tất! Đã tải {saved_total} ảnh.")
    log("Đang nén thư mục...")
    log(zip_folder(save_folder))

    return "\n".join(messages)

# Gradio user interface
def create_interface():
    """Build the Gradio Blocks UI for the image downloader.

    The left column holds the inputs (seed query, target count, per-term
    limit, save folder); the right column holds the status box and the start
    button. Clicking the button runs ``start_download`` and shows its
    returned log in the status box.

    Returns:
        The assembled ``gr.Blocks`` object (not yet launched).
    """

    def run_download(query, target, max_term, folder):
        # Coerce both numeric inputs to int before delegating.
        wanted_total = int(target)
        per_term_cap = int(max_term)
        return start_download(query, wanted_total, per_term_cap, folder)

    with gr.Blocks(title="Image Downloader") as app:
        gr.Markdown("# Image Downloader")
        gr.Markdown("Tải ảnh từ Google Images và nén thành file zip.")

        with gr.Row():
            # Left column: user inputs.
            with gr.Column():
                query_box = gr.Textbox(label="Từ khóa ban đầu", value="free images")
                target_box = gr.Number(label="Số lượng ảnh cần tải", value=10000, precision=0)
                per_term_box = gr.Number(label="Số ảnh tối đa mỗi từ khóa", value=20, precision=0)
                folder_box = gr.Textbox(label="Thư mục lưu", value="free_images")

            # Right column: status output and trigger.
            with gr.Column():
                status_box = gr.Textbox(label="Trạng thái", lines=20, interactive=False)
                start_button = gr.Button("Bắt đầu tải")

        start_button.click(
            fn=run_download,
            inputs=[query_box, target_box, per_term_box, folder_box],
            outputs=status_box,
        )

    return app

# Run the application
if __name__ == "__main__":
    # Build and serve the UI only when executed as a script, not on import.
    create_interface().launch()