import subprocess
import json
import os
import requests

# --- Configuration for the Hugging Face datasets-server "rows" endpoint ---
base_url = "https://datasets-server.huggingface.co/rows"
dataset_path = "cat-state/mscoco-1st-caption"
config = "default"
split = "train"
offset = 0
length = 100           # rows per request (API page size)
total_data = 1000      # total number of rows to fetch
iterations = total_data // length  # number of paginated requests needed

image_dir = "../images_large"
os.makedirs(image_dir, exist_ok=True)

# Maps "<row_idx>_row_image" -> caption text; written out as JSON at the end.
text_data = {}

# One session for all HTTP calls so connections are pooled and reused.
session = requests.Session()

# Page through the dataset until `total_data` rows have been requested.
for _ in range(iterations):
    url = (
        f"{base_url}?dataset={dataset_path}&config={config}"
        f"&split={split}&offset={offset}&length={length}"
    )

    # Advance the offset up front: even if this page fails, the next
    # iteration must request the NEXT page.  (The original incremented at
    # the bottom of the loop and `continue`d past it on a parse error,
    # silently re-fetching the same offset and under-covering the range.)
    offset += length

    # Fetch one page of rows directly with requests.  The original shelled
    # out to `curl`, which ignored HTTP errors and parsed whatever curl
    # printed; here a non-2xx status or malformed JSON is caught explicitly.
    try:
        page = session.get(url, timeout=30)
        page.raise_for_status()
        data_dict = page.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"无法将输出转换为字典。输出内容: {exc}")
        continue

    if 'rows' in data_dict:
        for item in data_dict['rows']:
            row_idx = item['row_idx']
            row = item['row']
            image_url = row.get('url')
            text = row.get('caption')

            if image_url:
                image_filename = f"{image_dir}/{row_idx}_row_image.jpg"
                # Stream the image to disk in chunks; timeout prevents one
                # dead URL from hanging the whole run.
                try:
                    response = session.get(image_url, stream=True, timeout=30)
                except requests.RequestException:
                    continue  # skip this image, keep processing the page
                if response.status_code == 200:
                    with open(image_filename, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                    text_data[f"{row_idx}_row_image"] = text

# Persist the caption index alongside the downloaded images.
json_filename = "../data/row_image_texts_large.json"
with open(json_filename, 'w') as f:
    json.dump(text_data, f, indent=4)

print("图像下载并保存完成,文本信息已保存到 row_image_texts_large.json")