File size: 927 Bytes
b10121d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import json
import random

# Input dataset in JSON Lines format: one JSON object (sample) per line.
DATASET_PATH = "sharegpt4video_40k.jsonl"
# Only samples belonging to this video shard are kept.
VIDEO_SHARD_NAME = "panda_videos_1.zip"
# Upper bound on the number of samples written to the output file.
NUM_SAMPLES = 100
# Fixed seed so that the shuffle (and therefore the sample) is reproducible.
SEED = 1
# Output file name; derived from NUM_SAMPLES so the two cannot drift apart.
OUTPUT_PATH = f"sharegpt4video_{NUM_SAMPLES}.json"
# Value of a caption's "idx" field that marks the whole-video caption.
WHOLE_VIDEO_CAPTION_IDX = "-1"


def main() -> None:
    """Sample whole-video captions from one video shard and save them as JSON.

    Reads ``DATASET_PATH`` line by line, parses every line that contains
    ``VIDEO_SHARD_NAME`` as JSON, shuffles the parsed samples using ``SEED``
    and keeps the first ``NUM_SAMPLES`` of them. For each kept sample, the
    caption whose ``idx`` equals ``"-1"`` is collected together with the
    sample's ``video_id``. The result is written to ``OUTPUT_PATH`` as a JSON
    object with two parallel lists, ``caption`` and ``video_id``.

    Raises:
        AssertionError: If a selected sample's ``zip_folder`` is not
            ``VIDEO_SHARD_NAME``, or if it has no caption with ``idx == "-1"``.
    """
    # The substring test is a cheap pre-filter that avoids parsing lines from
    # other shards (it also skips blank lines); the exact shard is verified
    # per sample below.
    with open(DATASET_PATH, encoding="utf-8") as dataset_file:
        dataset = [
            json.loads(line) for line in dataset_file if VIDEO_SHARD_NAME in line
        ]

    random.seed(SEED)
    random.shuffle(dataset)

    sampled = {"caption": [], "video_id": []}
    for sample in dataset[:NUM_SAMPLES]:
        # Raised explicitly (instead of using `assert`) so the validation
        # still runs under `python -O`, while keeping the same exception type.
        if sample["zip_folder"] != VIDEO_SHARD_NAME:
            raise AssertionError(f"sample from wrong video shard: {sample}")

        whole_video_caption = next(
            (c for c in sample["captions"] if c["idx"] == WHOLE_VIDEO_CAPTION_IDX),
            None,
        )
        if whole_video_caption is None:
            raise AssertionError(
                f"whole video caption not found for sample: {sample}"
            )

        sampled["caption"].append(whole_video_caption["content"])
        sampled["video_id"].append(sample["video_id"])

    # The output is opened only after every sample has been validated, and the
    # context manager guarantees the data is flushed and the handle closed.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as output_file:
        json.dump(sampled, output_file)


# Run the sampling only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()