Spaces:
Running
Running
File size: 927 Bytes
b10121d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
import json
import random
# JSON Lines input: one JSON record per line.
DATASET_PATH: str = "sharegpt4video_40k.jsonl"

# Only records whose "zip_folder" equals this archive name are sampled.
VIDEO_SHARD_NAME: str = "panda_videos_1.zip"

# Upper bound on how many shuffled records are kept.
NUM_SAMPLES: int = 100

# Fixed RNG seed so the shuffle (and therefore the sample) is reproducible.
SEED: int = 1
def main() -> None:
    """Sample whole-video captions from one video shard and save them as JSON.

    Reads the JSON Lines file at ``DATASET_PATH``, keeps the records whose raw
    line contains ``VIDEO_SHARD_NAME``, shuffles them after seeding the global
    ``random`` generator with ``SEED`` and, for the first ``NUM_SAMPLES``
    records, collects the caption whose ``idx`` is ``"-1"`` together with the
    record's ``video_id``.

    The result is written to ``sharegpt4video_100.json`` in the current
    working directory as ``{"caption": [...], "video_id": [...]}``; the two
    lists are index-aligned.

    Raises:
        AssertionError: If a selected record's ``zip_folder`` differs from
            ``VIDEO_SHARD_NAME``, or it has no caption with ``idx == "-1"``.
    """
    # Substring pre-filter: only lines mentioning the shard are parsed at all.
    # The context manager closes the input file instead of leaking the handle.
    with open(DATASET_PATH, encoding="utf-8") as dataset_file:
        dataset = [
            json.loads(line) for line in dataset_file if VIDEO_SHARD_NAME in line
        ]

    # Seed before shuffling so repeated runs pick the same records.
    random.seed(SEED)
    random.shuffle(dataset)

    sampled = dict(caption=[], video_id=[])
    for sample in dataset[:NUM_SAMPLES]:
        # The substring filter can match the shard name anywhere in the line,
        # so confirm the record really belongs to the requested shard.
        assert sample["zip_folder"] == VIDEO_SHARD_NAME, f"sample from wrong video shard: {sample}"
        whole_video_caption = next(
            (c for c in sample["captions"] if c["idx"] == "-1"), None
        )
        assert whole_video_caption is not None, f"whole video caption not found for sample: {sample}"
        sampled["caption"].append(whole_video_caption["content"])
        sampled["video_id"].append(sample["video_id"])

    # Closing the output file explicitly guarantees the JSON is flushed to
    # disk before main() returns.
    with open("sharegpt4video_100.json", "w", encoding="utf-8") as output_file:
        json.dump(sampled, output_file)
# Run the sampling only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|