import subprocess
import json
import os
import requests
# Configuration for the HuggingFace datasets-server /rows endpoint
base_url = "https://datasets-server.huggingface.co/rows"
dataset_path = "cat-state/mscoco-1st-caption"
config = "default"
split = "train"
offset = 0
length = 100                        # rows per request (the API caps length at 100)
total_data = 1000                   # total number of rows to fetch
iterations = total_data // length   # number of paginated requests needed

image_dir = "../images_large"
if not os.path.exists(image_dir):
    os.makedirs(image_dir)
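
# For reference, the JSON this script expects from the /rows endpoint is
# sketched below. The shape is inferred from the fields read further down;
# real responses also carry extra metadata (e.g. dataset features), which
# this script ignores:
#
#   {
#       "rows": [
#           {"row_idx": 0, "row": {"url": "https://...", "caption": "..."}},
#           ...
#       ]
#   }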
text_data = {}

# Loop over pages until the target number of rows has been fetched
for i in range(iterations):
    # Build the request URL for the current page
    url = f"{base_url}?dataset={dataset_path}&config={config}&split={split}&offset={offset}&length={length}"
    # Fetch the page by shelling out to curl
    result = subprocess.run(
        ["curl", "-X", "GET", url],
        capture_output=True,
        text=True
    )
    output = result.stdout
    try:
        data_dict = json.loads(output)
    except json.JSONDecodeError:
        print(f"Could not parse the response as JSON. Output was: {output}")
        continue
    if 'rows' in data_dict:
        for item in data_dict['rows']:
            row_idx = item['row_idx']
            row = item['row']
            image_url = row.get('url')
            text = row.get('caption')
            if image_url:
                image_filename = f"{image_dir}/{row_idx}_row_image.jpg"
                # Stream the image to disk in 8 KiB chunks
                response = requests.get(image_url, stream=True)
                if response.status_code == 200:
                    with open(image_filename, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                    # Record the caption only for successfully downloaded images
                    text_data[f"{row_idx}_row_image"] = text
    offset += length  # advance the offset to the next page
# Save the captions, keyed by the image filename stem
json_filename = "../data/row_image_texts_large.json"
os.makedirs(os.path.dirname(json_filename), exist_ok=True)  # ensure ../data exists
with open(json_filename, 'w') as f:
    json.dump(text_data, f, indent=4)
print("Image download finished; captions saved to row_image_texts_large.json")
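
# Note: shelling out to curl is not strictly needed here, since requests is
# already imported. A minimal equivalent sketch of the page fetch (same
# parameters as above; an untested drop-in substitution):
#
#   response = requests.get(base_url, params={
#       "dataset": dataset_path, "config": config,
#       "split": split, "offset": offset, "length": length,
#   })
#   data_dict = response.json()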