# coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Copyright (c) Facebook, Inc. All Rights Reserved import torch import math import pickle import random import os import numpy as np from collections import deque from typing import Optional, Tuple, List from .processor import ( Processor, MetaProcessor, TextProcessor, Aligner, MMAttentionMask2DProcessor ) from ..utils import ShardedTensor class How2MetaProcessor(MetaProcessor): def __init__(self, config): super().__init__(config) path = self._get_split_path(config) with open(path) as fd: self.data = [line.strip() for line in fd] def __getitem__(self, idx): video_id = self.data[idx] return video_id, video_id class ShardedHow2MetaProcessor(How2MetaProcessor): def __init__(self, config): super().__init__(config) self.split = str(config.split) self.vfeat_dir = config.vfeat_dir self._init_shard() def _init_shard(self): if self.split == "train": meta_fn = os.path.join(self.vfeat_dir, "train" + "_meta.pkl") with open(meta_fn, "rb") as fr: meta = pickle.load(fr) elif self.split == "valid": meta_fn = os.path.join(self.vfeat_dir, "val" + "_meta.pkl") with open(meta_fn, "rb") as fr: meta = pickle.load(fr) elif self.split == "test": print("use how2 val as test.") meta_fn = os.path.join(self.vfeat_dir, "val" + "_meta.pkl") with open(meta_fn, "rb") as fr: meta = pickle.load(fr) else: raise ValueError("unsupported for MetaProcessor:", self.split) video_id_to_shard = {} for shard_id in meta: for video_idx, video_id in enumerate(meta[shard_id]): video_id_to_shard[video_id] = (shard_id, video_idx) self.video_id_to_shard = video_id_to_shard def __getitem__(self, idx): video_id, video_id = super().__getitem__(idx) shard_id, shard_idx = self.video_id_to_shard[video_id] meta = (video_id, idx, shard_id, shard_idx) return meta, meta class ShardedVideoProcessor(Processor): """ mmaped shards of numpy video features. """ def __init__(self, config): self.split = str(config.split) self.vfeat_dir = config.vfeat_dir def __call__(self, video_id): _, _, shard_id, video_idx = video_id if self.split == "train": shard = ShardedTensor.load( os.path.join(self.vfeat_dir, "train" + "_" + str(shard_id)), "r" ) elif self.split == "valid": shard = ShardedTensor.load( os.path.join(self.vfeat_dir, "val" + "_" + str(shard_id)), "r" ) elif self.split == "test": shard = ShardedTensor.load( os.path.join(self.vfeat_dir, "val" + "_" + str(shard_id)), "r" ) else: raise ValueError("unknown split", self.split) feat = shard[video_idx] return feat class ShardedTextProcessor(Processor): def __init__(self, config): self.tfeat_dir = str(config.tfeat_dir) self.split = str(config.split) def __call__(self, video_id): _, _, shard_id, shard_idx = video_id if self.split == "train": target_path = self.tfeat_dir + "train" + "_" + str(shard_id) elif self.split == "valid": target_path = self.tfeat_dir + "val" + "_" + str(shard_id) elif self.split == "test": target_path = self.tfeat_dir + "val" + "_" + str(shard_id) else: raise ValueError("unknown split", self.split) startend = ShardedTensor.load( target_path + ".startends", "r")[shard_idx] cap_ids = ShardedTensor.load( target_path + ".caps_ids", "r")[shard_idx] cap = [] for clip_idx in range(len(cap_ids)): clip = cap_ids[clip_idx] cap.append(clip[clip != -1].tolist()) start, end = startend[:, 0].tolist(), startend[:, 1].tolist() return {"start": start, "end": end, "cap": cap} class FixedLenAligner(Aligner): """ In the model we assume text is on the left (closer to BERT formulation) and video is on the right. We fix the total length of text + video. max_video_len is in number of secs. max_text_len is in number of tokens. special tokens formats: we use the format [CLS] [SEP] text tokens [SEP] [PAD] ... [CLS] will be splitted out into: [CLS] video tokens [SEP] text tokens [SEP] [PAD] ... token_type_ids will be generated by the model (for now). 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | so each sequence owns a [SEP] token for no-ops. """ def __init__(self, config): super().__init__(config) self.text_clip_sampler = TextClipSamplingProcessor( self.max_len - self.max_video_len - 3 ) """ decide subsampling: `config.subsampling` will change batch_size in trainer. `config.clip_per_video` (used by RetriTask) doesn't change batch_size in trainer. """ subsampling = config.subsampling \ if config.subsampling is not None else None if config.clip_per_video is not None: subsampling = config.clip_per_video self.subsampling = subsampling def _get_text_maxlen(self): # use max text len return self.text_clip_sampler.max_text_len def __call__(self, video_id, video_feature, text_feature): from transformers import default_data_collator video_idx = video_id[1] if self.subsampling is not None and self.subsampling >= 1: batch = [] for _ in range(self.subsampling): centerclip_idx = random.randint( 0, len(text_feature["start"]) - 1) batch.append( self.sampling( video_idx, video_feature, text_feature, centerclip_idx, self._get_text_maxlen() )) batch = self.batch_post_processing(batch, video_feature) batch = default_data_collator(batch) else: raise ValueError( "dataset.subsampling must be >= 1 for efficient video loading.") batch = self.sampling(video_idx, video_feature, text_feature) batch = self.batch_post_processing(batch, video_feature) batch["video_id"] = video_id if isinstance(video_id, str) \ else video_id[0] # e2e: make sure frame ids is into tensor. assert torch.is_tensor(batch["vfeats"]) return batch def sampling( self, video_idx, video_feature, text_feature, centerclip_idx=None, sampled_max_text_len=None, ): text_clip_indexs = self.text_clip_sampler( text_feature, centerclip_idx, sampled_max_text_len ) if isinstance(video_feature, np.ndarray): video_len = len(video_feature) else: video_len = math.ceil(text_feature["end"][-1]) video_end = min( math.ceil(text_feature["end"][text_clip_indexs[-1]]), video_len ) video_start = max( min( math.floor(text_feature["start"][text_clip_indexs[0]]), video_end), 0 ) video_clips = {"start": [video_start], "end": [video_end]} # tensorize. vfeats, vmasks = self._build_video_seq( video_feature, video_clips ) caps, cmasks = self._build_text_seq( text_feature, text_clip_indexs ) text_start = text_clip_indexs[0] text_end = text_clip_indexs[-1] + 1 return { "caps": caps, "cmasks": cmasks, "vfeats": vfeats, "vmasks": vmasks, "video_start": video_start, "video_end": video_end, "text_start": text_start, "text_end": text_end, } class VariedLenAligner(FixedLenAligner): def __init__(self, config): super().__init__(config) self.sampled_min_len = config.sampled_min_len self.sampled_max_len = config.sampled_max_len def _get_text_maxlen(self): return random.randint(self.sampled_min_len, self.sampled_max_len) class StartClipAligner(VariedLenAligner): def sampling( self, video_idx, video_feature, text_feature, centerclip_idx=None, sampled_max_text_len=None, ): return super().sampling( video_idx, video_feature, text_feature, 0) class OverlappedAligner(VariedLenAligner): """video clip and text clip has overlappings but may not be the same start/end.""" def __init__(self, config): super().__init__(config) self.sampled_video_min_len = config.sampled_video_min_len self.sampled_video_max_len = config.sampled_video_max_len self.video_clip_sampler = VideoClipSamplingProcessor() def _get_video_maxlen(self): return random.randint( self.sampled_video_min_len, self.sampled_video_max_len) def sampling( self, video_idx, video_feature, text_feature, centerclip_idx=None, sampled_max_text_len=None, ): text_clip_indexs = self.text_clip_sampler( text_feature, centerclip_idx, sampled_max_text_len ) if isinstance(video_feature, np.ndarray): video_len = len(video_feature) else: video_len = math.ceil(text_feature["end"][-1]) low = math.floor(text_feature["start"][text_clip_indexs[0]]) high = math.ceil(text_feature["end"][text_clip_indexs[-1]]) if low < high: center = random.randint(low, high) else: center = int((low + high) // 2) center = max(0, min(video_feature.shape[0] - 1, center)) assert 0 <= center < video_feature.shape[0] video_clips = self.video_clip_sampler( video_len, self._get_video_maxlen(), center ) video_start = video_clips["start"][0] video_end = video_clips["end"][0] # tensorize. vfeats, vmasks = self._build_video_seq( video_feature, video_clips ) caps, cmasks = self._build_text_seq( text_feature, text_clip_indexs ) text_start = text_clip_indexs[0] text_end = text_clip_indexs[-1] + 1 return { "caps": caps, "cmasks": cmasks, "vfeats": vfeats, "vmasks": vmasks, "video_start": video_start, "video_end": video_end, "text_start": text_start, "text_end": text_end, } class MFMMLMAligner(FixedLenAligner): """ `FixedLenAligner` with Masked Language Model and Masked Frame Model. """ def __init__(self, config): super().__init__(config) keep_prob = config.keep_prob if config.keep_prob is not None else 1.0 self.text_clip_sampler = TextClipSamplingProcessor( self.max_len - self.max_video_len - 3, keep_prob ) self.sampled_min_len = config.sampled_min_len self.sampled_max_len = config.sampled_max_len self.masked_token_sampler = TextMaskingProcessor(config) self.mm_type = config.mm_type \ if config.mm_type is not None else "full" self.attnmasker = MMAttentionMask2DProcessor() \ if self.mm_type == "textgen" else None self.masked_frame_sampler = FrameMaskingProcessor(config) self.lazy_vfeat_mask = ( False if config.lazy_vfeat_mask is None else config.lazy_vfeat_mask ) self.mm_prob = config.mm_prob if config.mm_prob is not None else 0. def __call__(self, video_id, video_feature, text_feature): from transformers import default_data_collator if self.subsampling is not None and self.subsampling > 1: batch = [] for _ in range(self.subsampling): centerclip_idx = random.randint( 0, len(text_feature["start"]) - 1) sampled_max_text_len = random.randint( self.sampled_min_len, self.sampled_max_len ) batch.append( self.sampling( video_id, video_feature, text_feature, centerclip_idx, sampled_max_text_len, ) ) batch = self.batch_post_processing(batch, video_feature) batch = default_data_collator(batch) else: batch = self.sampling(video_id, video_feature, text_feature) batch = self.batch_post_processing(batch, video_feature) batch["video_id"] = video_id if isinstance(video_id, str) \ else video_id[0] return batch def sampling( self, video_id, video_feature, text_feature, centerclip_idx=None, sampled_max_text_len=None, ): output = FixedLenAligner.sampling(self, video_id, video_feature, text_feature, centerclip_idx, sampled_max_text_len) masking_text, masking_video = None, None if random.random() < self.mm_prob: if random.random() > 0.5: masking_text, masking_video = self.mm_type, "no" else: masking_text, masking_video = "no", "full" video_feats = output["vfeats"] if not self.lazy_vfeat_mask else None video_label = self.masked_frame_sampler( output["vmasks"], masking_video, vfeats=video_feats) caps, text_label = self.masked_token_sampler( output["caps"], masking_text) output.update({ "caps": caps, "video_label": video_label, "text_label": text_label, }) if self.attnmasker is not None: attention_mask = self.attnmasker( output["vmasks"], output["cmasks"], masking_text) output.update({ "attention_mask": attention_mask }) return output class FrameMaskingProcessor(Processor): def __init__(self, config): self.mfm_probability = 0.15 if config.mfm_probability is not None: self.mfm_probability = config.mfm_probability def __call__(self, vmasks, modality_masking=None, vfeats=None): """ We perform lazy masking to save data transfer time. It only generates video_labels by default and MFM model will do actualy masking. Return: `video_label` is a binary mask. """ video_label = vmasks.clone() if modality_masking is not None: if modality_masking == "full": probability_matrix = torch.full(video_label.shape, 1.) elif modality_masking == "no": probability_matrix = torch.full(video_label.shape, 0.) elif modality_masking == "inverse": probability_matrix = torch.full( video_label.shape, 1. - self.mfm_probability) else: raise ValueError("unknown modality masking.", modality_masking) else: probability_matrix = torch.full( video_label.shape, self.mfm_probability) masked_indices = torch.bernoulli(probability_matrix).bool() # We only compute loss on masked tokens video_label[~masked_indices] = 0 if vfeats is not None: vfeats[video_label, :] = 0.0 return video_label class TextGenerationProcessor(Processor): def __init__(self, tokenizer): self.bos_token_id = tokenizer.bos_token_id self.pad_token_id = tokenizer.pad_token_id def __call__(self, inputs): labels = inputs.clone() # [CLS] [SEP] for video labels[:2] = -100 # keep [SEP] for text. pad_mask = labels == self.pad_token_id labels[pad_mask] = -100 inputs[2:] = torch.cat([ torch.LongTensor([self.bos_token_id]), inputs[2:-1]]) inputs[pad_mask] = self.pad_token_id assert len(inputs) == len(labels) return inputs, labels class TextMaskingProcessor(Processor): def __init__(self, config): """this function is borrowed from `transformers/data/data_collator.DataCollatorForLanguageModeling`""" self.mlm_probability = 0.15 if config.mlm_probability is not None: self.mlm_probability = config.mlm_probability self.bert_name = config.bert_name # [CLS] is used as bos_token and [SEP] is used as eos_token. # https://huggingface.co/transformers/master/model_doc/bertgeneration.html from transformers import AutoTokenizer self.tokenizer = AutoTokenizer.from_pretrained( self.bert_name, bos_token="[CLS]", eos_token="[SEP]") self.textgen = TextGenerationProcessor(self.tokenizer) def __call__( self, inputs: torch.Tensor, modality_masking=None, special_tokens_mask: Optional[torch.Tensor] = None ) -> Tuple[torch.Tensor, torch.Tensor]: """ expand modality_masking into None: traditional bert masking. "no": no masking. "full": all [MASK] token for generation. "gen": autoregressive generation. """ """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ labels = inputs.clone() # We sample a few tokens in each sequence for MLM training # (with probability `self.mlm_probability`) if modality_masking is not None: if modality_masking == "full": probability_matrix = torch.full(labels.shape, 1.) elif modality_masking == "no": probability_matrix = torch.full(labels.shape, 0.) elif modality_masking.startswith("textgen"): # [CLS] [SEP] ... inputs, labels = self.textgen(inputs) if "mask" not in modality_masking: return inputs, labels inputs = self.mask_input(inputs, special_tokens_mask) return inputs, labels elif modality_masking == "mask": inputs = self.mask_input(inputs, special_tokens_mask) labels = torch.full(inputs.shape, -100) return inputs, labels elif modality_masking == "inverse": probability_matrix = torch.full(labels.shape, 1. - self.mlm_probability) else: raise ValueError("unknown modality masking.", modality_masking) else: probability_matrix = torch.full(labels.shape, self.mlm_probability) if special_tokens_mask is None: special_tokens_mask = self.get_special_tokens_mask( labels.tolist(), already_has_special_tokens=True ) special_tokens_mask = torch.tensor( special_tokens_mask, dtype=torch.bool) else: special_tokens_mask = special_tokens_mask.bool() probability_matrix.masked_fill_(special_tokens_mask, value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -100 # We only compute loss on masked tokens # 80% of the time, # we replace masked input tokens with tokenizer.mask_token ([MASK]) indices_replaced = ( torch.bernoulli( torch.full(labels.shape, 0.8)).bool() & masked_indices ) inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids( self.tokenizer.mask_token ) # 10% of the time, we replace masked input tokens with random word indices_random = ( torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced ) random_words = torch.randint( len(self.tokenizer), labels.shape, dtype=torch.long ) inputs[indices_random] = random_words[indices_random] # The rest of the time (10% of the time) we keep the masked input # tokens unchanged return inputs, labels def mask_input(self, inputs, special_tokens_mask=None): # the following is new with masked autoregressive. probability_matrix = torch.full( inputs.shape, self.mlm_probability) if special_tokens_mask is None: special_tokens_mask = self.get_special_tokens_mask( inputs.tolist(), already_has_special_tokens=True ) special_tokens_mask = torch.tensor( special_tokens_mask, dtype=torch.bool) else: special_tokens_mask = special_tokens_mask.bool() probability_matrix.masked_fill_(special_tokens_mask, value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() indices_replaced = ( torch.bernoulli( torch.full(inputs.shape, 0.8)).bool() & masked_indices ) inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids( self.tokenizer.mask_token ) # 10% of the time, we replace masked input tokens with random word indices_random = ( torch.bernoulli(torch.full(inputs.shape, 0.5)).bool() & masked_indices & ~indices_replaced ) random_words = torch.randint( len(self.tokenizer), inputs.shape, dtype=torch.long ) inputs[indices_random] = random_words[indices_random] return inputs def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: """ Note: the version from transformers do not consider pad as special tokens. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if" "the provided sequence of " "ids is already formated with special tokens " "for the model." ) return list(map(lambda x: 1 if x in [ self.tokenizer.sep_token_id, self.tokenizer.cls_token_id, self.tokenizer.pad_token_id] else 0, token_ids_0)) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] class TextClipSamplingProcessor(Processor): def __init__(self, max_text_len, keep_prob=1.0): self.max_text_len = max_text_len self.max_video_len = 256 # always hold. self.keep_prob = keep_prob def __call__( self, text_feature, centerclip_idx=None, sampled_max_text_len=None, sampled_max_video_len=None, ): # Let's use all caps for now and see if 256 can cover all of them. if sampled_max_text_len is not None: max_text_len = sampled_max_text_len else: max_text_len = self.max_text_len if sampled_max_video_len is not None: max_video_len = sampled_max_video_len else: max_video_len = self.max_video_len t_num_clips = len(text_feature["start"]) if centerclip_idx is None: centerclip_idx = random.randint(0, t_num_clips - 1) start_idx, end_idx = centerclip_idx, centerclip_idx + 1 text_clip_indexs = deque() text_clip_indexs.append(start_idx) text_len = len(text_feature["cap"][start_idx]) video_len = max( 0, text_feature["end"][start_idx] - text_feature["start"][start_idx], ) while ( (start_idx > 0 or end_idx < t_num_clips) and text_len < max_text_len and video_len < max_video_len ): if random.random() > 0.5 and end_idx < t_num_clips: # skip the next one? if random.random() > self.keep_prob and (end_idx + 1) < t_num_clips: end_idx = end_idx + 1 text_clip_indexs.append(end_idx) text_len += len(text_feature["cap"][end_idx]) end_idx += 1 elif start_idx > 0: if random.random() > self.keep_prob and (start_idx - 1) > 0: start_idx = start_idx - 1 start_idx -= 1 text_clip_indexs.insert(0, start_idx) text_len += len(text_feature["cap"][start_idx]) else: if end_idx < t_num_clips: if random.random() > self.keep_prob and (end_idx + 1) < t_num_clips: end_idx = end_idx + 1 text_clip_indexs.append(end_idx) text_len += len(text_feature["cap"][end_idx]) end_idx += 1 else: return text_clip_indexs video_len = max( 0, text_feature["end"][text_clip_indexs[-1]] - text_feature["start"][text_clip_indexs[0]], ) return text_clip_indexs class VideoClipSamplingProcessor(Processor): def __call__(self, video_len, max_video_len, center): """ `video_len`: length of the video. `max_video_len`: maximum video tokens allowd in a sequence. `center`: initial starting index. """ assert center >= 0 and center < video_len t_clip_len = 0 start, end = center, center while (start > 0 or end < video_len) and t_clip_len < max_video_len: # decide the direction to grow. if start <= 0: end += 1 elif end >= video_len: start -= 1 elif random.random() > 0.5: end += 1 else: start -= 1 t_clip_len += 1 return {"start": [start], "end": [end]} class How2MILNCEAligner(FixedLenAligner): """reference: `antoine77340/MIL-NCE_HowTo100M/video_loader.py`""" def __init__(self, config): super().__init__(config) self.num_candidates = 4 self.min_time = 5.0 self.num_sec = 3.2 # self.num_sec = self.num_frames / float(self.fps) num_frames=16 / fps = 5 # self.num_frames = 16 def sampling( self, video_id, video_feature, text_feature, centerclip_idx=None, # will be ignored. sampled_max_text_len=None # will be ignored. ): text, start, end = self._get_text(text_feature) video = self._get_video(video_feature, start, end) vfeats = torch.zeros((self.max_video_len, video_feature.shape[1])) vmasks = torch.zeros((self.max_video_len,), dtype=torch.bool) vfeats[: video.shape[0]] = torch.from_numpy(np.array(video)) vmasks[: video.shape[0]] = 1 caps, cmasks = [], [] for words in text: cap, cmask = self._build_text_seq(text_feature, words) caps.append(cap) cmasks.append(cmask) caps = torch.stack(caps) cmasks = torch.stack(cmasks) # video of shape: (video_len) # text of shape (num_candidates, max_text_len) return { "caps": caps, "cmasks": cmasks, "vfeats": vfeats, "vmasks": vmasks, # "video_id": video_id, } def _get_video(self, video_feature, start, end): start_seek = random.randint(start, int(max(start, end - self.num_sec))) # duration = self.num_sec + 0.1 return video_feature[start_seek : int(start_seek + self.num_sec)] def _get_text(self, cap): ind = random.randint(0, len(cap["start"]) - 1) if self.num_candidates == 1: words = [ind] else: words = [] cap_start = self._find_nearest_candidates(cap, ind) for i in range(self.num_candidates): words.append([max(0, min(len(cap["cap"]) - 1, cap_start + i))]) start, end = cap["start"][ind], cap["end"][ind] # TODO: May need to be improved for edge cases. # expand the min time. if end - start < self.min_time: diff = self.min_time - end + start start = max(0, start - diff / 2) end = start + self.min_time return words, int(start), int(end) def _find_nearest_candidates(self, caption, ind): """find the range of the clips.""" start, end = ind, ind #diff = caption["end"][end] - caption["start"][start] n_candidate = 1 while n_candidate < self.num_candidates: # the first clip if start == 0: return 0 # we add () in the following condition to fix the bug. elif end == (len(caption["start"]) - 1): return start - (self.num_candidates - n_candidate) elif (caption["end"][end] - caption["start"][start - 1]) < ( caption["end"][end + 1] - caption["start"][start] ): start -= 1 else: end += 1 n_candidate += 1 return start class PKLJSONStrTextProcessor(TextProcessor): """`caption.json` from howto100m are preprocessed as a dict `[video_id, json_str]`. Json parsing tokenization are conducted on-the-fly and cached into dict. """ def __init__(self, config, max_clip_text_len=96): print("[Warning] PKLJSONStrTextProcessor is slow for num_workers > 0.") self.caption_pkl_path = str(config.caption_pkl_path) with open(self.caption_pkl_path, "rb") as fd: self.data = pickle.load(fd) self.max_clip_text_len = max_clip_text_len from transformers import AutoTokenizer self.tokenizer = AutoTokenizer.from_pretrained( str(config.bert_name), use_fast=config.use_fast ) def __call__(self, video_id): caption = self.data[video_id] if isinstance(caption, str): import json caption = json.loads(caption) cap = [] for clip_idx, text_clip in enumerate(caption["text"]): clip_ids = [] if isinstance(text_clip, str): clip_ids = self.tokenizer( text_clip[: self.max_clip_text_len], add_special_tokens=False )["input_ids"] cap.append(clip_ids) caption["cap"] = cap caption.pop("text") # save space. self.data[video_id] = caption return caption