# starriver030515 / diffusion
#
# Model card | Files and versions | Community
#
# File size: 1,990 Bytes
#
# a501a0c

import subprocess
import json
import os
import requests

# Settings for the Hugging Face datasets-server "rows" endpoint.
base_url = "https://datasets-server.huggingface.co/rows"
dataset_path = "cat-state/mscoco-1st-caption"
config = "default"
split = "train"
offset = 0  # index of the first row of the next page to request
length = 100  # rows requested per page
total_data = 1000  # total number of rows to fetch
# Number of pages to request. Floor division drops any remainder when
# total_data is not a multiple of length.
iterations = total_data // length

# Directory the downloaded images are written to (relative to the working
# directory). exist_ok avoids the check-then-create race of a separate
# os.path.exists() test.
image_dir = "../images_large"
os.makedirs(image_dir, exist_ok=True)

# Maps "<row_idx>_row_image" to the caption of that row.
text_data = {}

# Seconds to wait on any single HTTP request, so that one stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30


def _fetch_rows(page_offset, page_length):
    """Fetch one page of dataset rows from the datasets-server.

    Args:
        page_offset: Index of the first row to request.
        page_length: Number of rows to request.

    Returns:
        The list stored under ``rows`` in the JSON response, or an empty
        list when the request fails, the body is not valid JSON, or the
        response carries no ``rows`` entry.
    """
    url = (
        f"{base_url}?dataset={dataset_path}&config={config}&split={split}"
        f"&offset={page_offset}&length={page_length}"
    )
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"获取数据失败 (offset={page_offset}): {exc}")
        return []

    output = response.text
    try:
        data_dict = json.loads(output)
    except json.JSONDecodeError:
        print(f"无法将输出转换为字典。输出内容: {output}")
        return []

    if not isinstance(data_dict, dict):
        return []
    return data_dict.get('rows') or []


def _download_image(image_url, image_filename):
    """Stream ``image_url`` into ``image_filename``.

    Args:
        image_url: URL of the image to download.
        image_filename: Path of the file to write.

    Returns:
        True when the server answered 200 and the body was written, False
        when the request failed or returned another status code.
    """
    try:
        # The context manager releases the streamed connection even when the
        # body is not fully consumed.
        with requests.get(image_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
            if response.status_code != 200:
                print(f"图像下载失败 (HTTP {response.status_code}): {image_url}")
                return False
            with open(image_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except requests.RequestException as exc:
        # One unreachable image must not abort the run and lose the captions
        # collected so far.
        print(f"图像下载失败: {image_url} ({exc})")
        return False
    return True


# Request the pages one after another until `iterations` pages were attempted.
for _ in range(iterations):
    for item in _fetch_rows(offset, length):
        row_idx = item['row_idx']
        row = item['row']
        image_url = row.get('url')

        if image_url:
            _download_image(image_url, f"{image_dir}/{row_idx}_row_image.jpg")

        # The caption is recorded whether or not the image was downloaded.
        text_data[f"{row_idx}_row_image"] = row.get('caption')

    # Advance even when the page failed, so every page is attempted exactly
    # once and a bad page cannot use up the attempts of the pages after it.
    offset += length

# Save the collected captions as JSON.
json_filename = "../data/row_image_texts_large.json"
# Create the output directory up front so the dump cannot fail with
# FileNotFoundError after all the downloads have completed.
os.makedirs(os.path.dirname(json_filename), exist_ok=True)
with open(json_filename, 'w', encoding='utf-8') as f:
    json.dump(text_data, f, indent=4)

# Report the path that was actually written.
print(f"图像下载并保存完成，文本信息已保存到 {json_filename}")