|
import os |
|
import shutil |
|
import json |
|
import random |
|
|
|
|
|
# Instruction prompts; one is drawn at random for the "human" turn of every
# generated conversation.
# NOTE: every entry needs its own trailing comma. Adjacent string literals
# without a comma are silently concatenated into a single string.
prompt_for_image = [
    "Describe the image concisely.",
    "Provide a brief description of the given image.",
    "Offer a succinct explanation of the picture presented.",
    "Summarize the visual content of the image.",
    "Give a short and clear explanation of the subsequent image.",
    "Share a concise interpretation of the image provided.",
    "Present a compact description of the photo's key features.",
    "Relay a brief, clear account of the picture shown.",
    "Render a clear and concise summary of the photo.",
    "Write a terse but informative summary of the picture.",
    "Create a compact narrative representing the image presented.",
]
|
|
|
|
|
# --- Input locations -------------------------------------------------------
# Folder holding the source images, stored as "<index>.jpg".
source_folder = "/mnt/petrelfs/zhuchenglin/diffusion/images_large1"
# COCO caption file; its "annotations" list supplies the captions.
annotations_coco_path = "/mnt/petrelfs/zhuchenglin/diffusion/coco/annotations/captions_train2017.json"

# --- Output locations ------------------------------------------------------
# Root folder that receives the copied images (one sub-folder per shard).
target_folder = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/images"
# Folder that receives the generated annotation JSON file.
target_anno_folder = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain"
|
# Load the COCO caption file; only its "annotations" list is used below.
with open(annotations_coco_path, "r", encoding="utf-8") as f:
    annotations = json.load(f)

# Build one conversation-style record per caption, copying the matching image
# into the target tree as we go. At most the first 500,000 captions are used.
new_annotations = []
for index, annotation in enumerate(annotations["annotations"][:500000]):
    print(index)

    # Images are sharded into sub-folders of 10,000, numbered from 00900.
    folder_index = 900 + (index // 10000)
    target_subfolder = f"{folder_index:05d}"

    # The sample id is the sub-folder name followed by the 4-digit position
    # inside that sub-folder; the image file name is the id plus ".jpg".
    sample_id = f"{target_subfolder}{index % 10000:04d}"
    target_image_name = f"{sample_id}.jpg"
    target_dir = os.path.join(target_folder, target_subfolder)
    target_image_path = os.path.join(target_dir, target_image_name)

    # exist_ok avoids the separate existence check and the race it implies.
    os.makedirs(target_dir, exist_ok=True)

    # The source image for the caption at position `index` is "<index>.jpg".
    source_image_path = os.path.join(source_folder, f"{index}.jpg")
    if not os.path.exists(source_image_path):
        # No image to copy: skip the record so the annotation file never
        # references an image that is absent from the target folder.
        continue
    shutil.copy(source_image_path, target_image_path)

    random_prompt = random.choice(prompt_for_image)
    new_annotation = {
        "id": sample_id,
        "image": f"{target_subfolder}/{target_image_name}",
        "conversations": [
            {"from": "human", "value": f"{random_prompt}\n<image>"},
            {"from": "gpt", "value": annotation["caption"]},
        ],
    }
    new_annotations.append(new_annotation)

# Write all collected records to a single JSON file.
json_file_path = os.path.join(target_anno_folder, "coco_annotations_500k.json")
with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(new_annotations, json_file, indent=4)
|
|