|
import subprocess |
|
import json |
|
import os |
|
import requests |
|
|
|
|
|
# --- Hugging Face datasets-server API configuration ---
# The /rows endpoint returns dataset rows as JSON, paginated by offset/length.
base_url = "https://datasets-server.huggingface.co/rows"

dataset_path = "cat-state/mscoco-1st-caption"

config = "default"

split = "train"

# Pagination cursor and page size: fetch `total_data` rows, `length` at a time.
offset = 0

length = 100

total_data = 1000

iterations = total_data // length  # number of pages to request

# Directory where downloaded images are written.
image_dir = "../images_large"

# exist_ok=True avoids the check-then-create race of
# `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(image_dir, exist_ok=True)

# Maps "<row_idx>_row_image" -> caption text for every fetched row.
text_data = {}
|
|
|
|
|
for i in range(iterations):

    # Build the request for one page of rows.
    url = f"{base_url}?dataset={dataset_path}&config={config}&split={split}&offset={offset}&length={length}"

    # Advance the pagination cursor up front: the original incremented it at
    # the bottom of the loop, so any `continue` on an error re-requested the
    # same page and silently dropped the final one.
    offset += length

    # Fetch the page with requests (already used below for the images)
    # instead of shelling out to curl — a missing/failing curl binary used
    # to yield an empty stdout that only surfaced as a JSON decode error.
    try:
        page = requests.get(url, timeout=30)
        page.raise_for_status()
        output = page.text
    except requests.RequestException as e:
        print(f"Request for rows at offset {offset - length} failed: {e}")
        continue

    try:
        data_dict = json.loads(output)
    except json.JSONDecodeError:
        # Original (Chinese) message: "could not convert output to a dict;
        # output content was: ..."
        print(f"无法将输出转换为字典。输出内容: {output}")
        continue

    if 'rows' in data_dict:
        for item in data_dict['rows']:
            row_idx = item['row_idx']
            row = item['row']
            image_url = row.get('url')
            text = row.get('caption')

            if image_url:
                image_filename = f"{image_dir}/{row_idx}_row_image.jpg"
                try:
                    response = requests.get(image_url, stream=True, timeout=30)
                except requests.RequestException as e:
                    # One bad image should not abort the whole run.
                    print(f"Image download failed for row {row_idx}: {e}")
                    continue
                if response.status_code == 200:
                    # Stream to disk in chunks to keep memory bounded.
                    with open(image_filename, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)

                # Record the caption keyed by the image's basename.
                text_data[f"{row_idx}_row_image"] = text
|
|
|
|
|
# Persist the caption lookup. Create ../data first so json.dump does not
# fail with FileNotFoundError on a fresh checkout.
json_filename = "../data/row_image_texts_large.json"

os.makedirs(os.path.dirname(json_filename), exist_ok=True)

with open(json_filename, 'w', encoding='utf-8') as f:

    json.dump(text_data, f, indent=4)

# Completion message (Chinese): "images downloaded and saved; text info
# saved to row_image_texts_large.json". Fixed: it previously named the
# wrong file (row_image_texts.json).
print("图像下载并保存完成,文本信息已保存到 row_image_texts_large.json")
|
|
|
|