# diffusion/utils/expand_dataset.py
# Uploaded by starriver030515 with huggingface_hub ("Upload folder using huggingface_hub"), commit a501a0c (verified).
import os
import shutil
import json
import random
# Instruction prompts (LLaVA style) asking for a short image description.
# One of them is picked at random for every generated conversation.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python, which would merge two prompts into one.
prompt_for_image = [
    "Describe the image concisely.",
    "Provide a brief description of the given image.",
    "Offer a succinct explanation of the picture presented.",
    "Summarize the visual content of the image.",
    "Give a short and clear explanation of the subsequent image.",
    "Share a concise interpretation of the image provided.",
    "Present a compact description of the photo's key features.",
    "Relay a brief, clear account of the picture shown.",
    "Render a clear and concise summary of the photo.",
    "Write a terse but informative summary of the picture.",
    "Create a compact narrative representing the image presented.",
]
# Directory holding the generated images.
source_folder = "/mnt/petrelfs/zhuchenglin/diffusion/images_large1"

# Directory that receives the LLaVA training annotation file.
target_anno_folder = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain"

# LLaVA training image directory (per the original note, it initially
# contains 660 subfolders); generated images are copied into it.
target_folder = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/images"

# COCO captions annotation file that supplies the caption texts.
annotations_coco_path = "/mnt/petrelfs/zhuchenglin/diffusion/coco/annotations/captions_train2017.json"
# Load the COCO captions file. JSON text is UTF-8, so the encoding is given
# explicitly instead of relying on the platform's locale default.
with open(annotations_coco_path, "r", encoding="utf-8") as f:
    annotations = json.load(f)
# Layout of the copied images inside target_folder: subfolders are numbered
# from START_FOLDER_INDEX upward and each one holds IMAGES_PER_FOLDER images.
START_FOLDER_INDEX = 900
IMAGES_PER_FOLDER = 10000
# Only the first MAX_ANNOTATIONS COCO captions are processed.
MAX_ANNOTATIONS = 500000

new_annotations = []
for index, annotation in enumerate(annotations["annotations"][:MAX_ANNOTATIONS]):
    print(index)

    folder_offset, position_in_folder = divmod(index, IMAGES_PER_FOLDER)
    target_subfolder = f"{START_FOLDER_INDEX + folder_offset:05d}"
    # Image id: 5-digit folder number followed by the 4-digit position in
    # that folder; the image file is "<id>.jpg".
    image_id = f"{target_subfolder}{position_in_folder:04d}"
    target_image_name = f"{image_id}.jpg"

    target_subfolder_path = os.path.join(target_folder, target_subfolder)
    target_image_path = os.path.join(target_subfolder_path, target_image_name)
    # exist_ok=True replaces the separate exists() check and cannot fail if
    # the directory appears between the check and the creation.
    os.makedirs(target_subfolder_path, exist_ok=True)

    # Generated images are named after their caption index: "<index>.jpg".
    source_image_path = os.path.join(source_folder, f"{index}.jpg")
    # NOTE(review): an annotation is recorded only when its generated image
    # exists, so the output never references a missing file. The original
    # indentation was not visible here - confirm this matches the intent.
    if not os.path.exists(source_image_path):
        continue

    shutil.copy(source_image_path, target_image_path)
    random_prompt = random.choice(prompt_for_image)
    new_annotations.append(
        {
            "id": image_id,
            "image": f"{target_subfolder}/{target_image_name}",
            "conversations": [
                {"from": "human", "value": f"{random_prompt}\n<image>"},
                {"from": "gpt", "value": annotation["caption"]},
            ],
        }
    )

# Write all collected conversations to a single annotation file.
json_file_path = os.path.join(target_anno_folder, "coco_annotations_500k.json")
with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(new_annotations, json_file, indent=4)