Spaces:
Running
on
Zero
Running
on
Zero
"""Build a ``metadata.jsonl`` file for a folder of image/caption pairs.

Every ``<stem>.txt`` caption file in the folder is matched with the regular
file that shares its stem (for example ``cat.txt`` with ``cat.png``).  One JSON
object per matched pair is written to ``<folder>/metadata.jsonl``.
"""
import argparse
import json
import pathlib

# Name of the file this script writes inside the target folder.
METADATA_FILE_NAME = "metadata.jsonl"


def _parse_args(argv=None):
    """Parse command-line options; ``argv=None`` makes argparse read ``sys.argv``."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--path",
        type=str,
        required=True,
        help="Path to folder with image-text pairs.",
    )
    parser.add_argument("--caption_column", type=str, default="prompt", help="Name of caption column.")
    return parser.parse_args(argv)


def _pair_captions_with_images(folder):
    """Return ``(caption_path, image_path)`` tuples for *folder*, sorted by caption path.

    A caption is any entry matching ``*.txt``.  An image candidate is any other
    regular file in the folder, except the metadata file itself.  Captions
    without a same-stem image are skipped.
    """
    captions = sorted(folder.glob("*.txt"))
    caption_set = set(captions)

    images_by_stem = {}
    for candidate in sorted(folder.glob("*")):
        if candidate in caption_set:
            continue
        # Directories and the output of a previous run are not images; without
        # this check `metadata.txt` would be paired with `metadata.jsonl`.
        if not candidate.is_file() or candidate.name == METADATA_FILE_NAME:
            continue
        # When several files share a stem, keep the first one in sorted order
        # so the result does not depend on directory listing order.
        images_by_stem.setdefault(candidate.stem, candidate)

    pairs = []
    for caption in captions:
        image = images_by_stem.get(caption.stem)
        if image is not None:
            pairs.append((caption, image))
    return pairs


def _write_metadata(folder, caption_column):
    """Write ``metadata.jsonl`` into *folder* and return the number of rows written.

    Each row has the keys ``"file_name"`` (image file name without directory)
    and *caption_column* (full text of the caption file, read as UTF-8 and not
    stripped).
    """
    # Collect the pairs before opening the output so the listing is taken
    # before the metadata file is created or truncated.
    pairs = _pair_captions_with_images(folder)
    metadata = folder.joinpath(METADATA_FILE_NAME)
    with metadata.open("w", encoding="utf-8") as f:
        for caption, image in pairs:
            caption_text = caption.read_text(encoding="utf-8")
            json.dump({"file_name": image.name, caption_column: caption_text}, f)
            f.write("\n")
    return len(pairs)


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Optional list of argument strings; defaults to ``sys.argv[1:]``.

    Raises:
        RuntimeError: If ``--path`` does not exist or is not a directory.
    """
    args = _parse_args(argv)
    path = pathlib.Path(args.path)
    if not path.exists():
        raise RuntimeError(f"`--path` '{args.path}' does not exist.")
    if not path.is_dir():
        raise RuntimeError(f"`--path` '{args.path}' is not a directory.")
    _write_metadata(path, args.caption_column)


if __name__ == "__main__":
    main()